PyPI - xinference - Versions diffs - 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl - Mend

xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (149) hide show

xinference/model/llm/vllm/core.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # limitations under the License.
 import asyncio
-import json
 import logging
 import multiprocessing
 import os
@@ -24,9 +23,9 @@ from typing import (
     Any,
     AsyncGenerator,
     Dict,
-    Iterable,
     List,
     Optional,
+    Tuple,
     TypedDict,
     Union,
 )
@@ -34,18 +33,20 @@ from typing import (
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
-    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
     LoRA,
-    ToolCallFunction,
-    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
-from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import (
+    QWEN_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_SYMBOLS,
+    ChatModelMixin,
+    generate_completion_chunk,
+)
 logger = logging.getLogger(__name__)
@@ -363,23 +364,28 @@ class VLLMModel(LLM):
     @staticmethod
     def _convert_request_output_to_completion_chunk(
         request_id: str, model: str, request_output: "RequestOutput"
-    ) -> CompletionChunk:
+    ) -> Tuple[CompletionChunk, Optional[str]]:
         choices: List[CompletionChoice] = []
+        finish_reason = None
         for output in request_output.outputs:
             choices.append(
                 CompletionChoice(
                     text=output.text,
                     index=output.index,
                     logprobs=None,  # TODO: support logprobs.
-                    finish_reason=output.finish_reason,
+                    finish_reason=None,
                 )
             )
-        return CompletionChunk(
-            id=request_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=model,
-            choices=choices,
+            finish_reason = output.finish_reason
+        return (
+            CompletionChunk(
+                id=request_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=model,
+                choices=choices,
+            ),
+            finish_reason,
         )
     @staticmethod
@@ -420,6 +426,7 @@ class VLLMModel(LLM):
         prompt: Union[str, Dict[str, Any]],
         generate_config: Optional[Dict] = None,
         tools: object = False,
+        request_id: Optional[str] = None,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams
@@ -454,7 +461,8 @@ class VLLMModel(LLM):
             else False
         )
         sampling_params = SamplingParams(**sanitized_generate_config)
-        request_id = str(uuid.uuid1())
+        if not request_id:
+            request_id = str(uuid.uuid1())
         assert self._engine is not None
         results_generator = self._engine.generate(
@@ -463,10 +471,14 @@ class VLLMModel(LLM):
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
-            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
             prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+            complete_response = ""
+            match_tool_call_tmp_results = []
+            is_match_tool_call = False
+            chunk = None
+            finish_reason = None
             async for _request_output in results_generator:
-                chunk = self._convert_request_output_to_completion_chunk(
+                chunk, finish_reason = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
@@ -476,40 +488,8 @@ class VLLMModel(LLM):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+                    complete_response += delta
-                if tools:
-                    # only handle the first choice
-                    choice = chunk["choices"][0]
-                    if choice["finish_reason"] is not None:
-                        # use previous text for evaluation temporarily
-                        choice_delta = choice["text"]
-                        choice["text"] = previous_texts[0]
-                        _content, func, args = ChatModelMixin._eval_tool_arguments(
-                            self.model_family, chunk, tools
-                        )
-                        choice["text"] = tools_token_filter(
-                            tokens=previous_texts[0], delta=choice_delta
-                        )
-                        if func is not None:
-                            choice["text"] = None
-                            choice["finish_reason"] = "tool_calls"
-                            choice["tool_calls"] = [
-                                ToolCalls(
-                                    id=str(uuid.uuid4()),
-                                    type="function",
-                                    function=ToolCallFunction(
-                                        name=func,
-                                        arguments=json.dumps(args, ensure_ascii=False),
-                                    ),
-                                )
-                            ]
-                    else:
-                        # use a filter function to skip Qwen's react thought process
-                        choice["text"] = tools_token_filter(
-                            tokens=previous_texts[0], delta=choice["text"]
-                        )
-                        if not choice["text"]:
-                            continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs
@@ -520,7 +500,59 @@ class VLLMModel(LLM):
                     completion_tokens=completion_tokens,
                     total_tokens=total_tokens,
                 )
+                if tools:
+                    """
+                    The qwen2 tool call returns format like this:
+                    <tool_call>
+                    {...}
+                    </tool_call>
+                    Here is to match this.
+                    """
+                    if (len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)) and (
+                        not QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response)
+                    ):
+                        for c in match_tool_call_tmp_results:
+                            yield c
+                        match_tool_call_tmp_results.clear()
+                        yield chunk
+                    elif (len(QWEN_TOOL_CALL_SYMBOLS[0]) > len(complete_response)) and (
+                        QWEN_TOOL_CALL_SYMBOLS[0].startswith(complete_response)
+                    ):
+                        match_tool_call_tmp_results.append(chunk)
+                    else:
+                        assert len(QWEN_TOOL_CALL_SYMBOLS[0]) <= len(complete_response)
+                        if not is_match_tool_call and complete_response.startswith(
+                            QWEN_TOOL_CALL_SYMBOLS[0]
+                        ):
+                            is_match_tool_call = True
+                            match_tool_call_tmp_results.clear()
+                        if not is_match_tool_call:
+                            for c in match_tool_call_tmp_results:
+                                yield c
+                            match_tool_call_tmp_results.clear()
+                            yield chunk
+                        else:
+                            chunk["choices"][0]["text"] = complete_response
+                else:
+                    yield chunk
+            if is_match_tool_call:
+                assert chunk is not None
                 yield chunk
+            # match OpenAI API stream
+            yield generate_completion_chunk(
+                chunk_text="",
+                finish_reason=finish_reason,
+                chunk_id=request_id,
+                model_uid=self.model_uid,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
             if include_usage:
                 chunk = CompletionChunk(
                     id=request_id,
@@ -586,59 +618,74 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Dict:
         if not generate_config:
             generate_config = {}
-        if self.model_family.prompt_style:
-            if (
-                not generate_config.get("stop")
-            ) and self.model_family.prompt_style.stop:
-                generate_config["stop"] = self.model_family.prompt_style.stop.copy()
-            if self.model_family.prompt_style.stop_token_ids:
-                generate_config.setdefault(
-                    "stop_token_ids",
-                    self.model_family.prompt_style.stop_token_ids.copy(),
-                )
+        if not generate_config.get("stop") and self.model_family.stop:
+            generate_config["stop"] = self.model_family.stop.copy()
+        if (
+            not generate_config.get("stop_token_ids")
+            and self.model_family.stop_token_ids
+        ):
+            generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy()
         return generate_config
+    @staticmethod
+    def is_tool_call_chunk(chunk):
+        return chunk["choices"][0]["text"].startswith(QWEN_TOOL_CALL_SYMBOLS[0])
+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        i = 0
+        async for chunk in chunks:
+            if i == 0:
+                yield self._get_first_chat_completion_chunk(chunk)
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield self._get_final_chat_completion_chunk(chunk)
+            else:
+                if self.is_tool_call_chunk(chunk):
+                    yield self._tool_calls_completion_chunk(
+                        self.model_family, self.model_uid, chunk
+                    )
+                else:
+                    yield self._to_chat_completion_chunk(chunk)
+            i += 1
     async def async_chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[Dict] = None,
+        request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        assert self.model_family.prompt_style is not None
-        prompt_style = self.model_family.prompt_style.copy()
-        if system_prompt:
-            prompt_style.system_prompt = system_prompt
-        chat_history = chat_history or []
         tools = generate_config.pop("tools", []) if generate_config else None
-        full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools)
-        generate_config = self._sanitize_chat_config(generate_config)
-        # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
+        full_context_kwargs = {}
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            stop = generate_config.get("stop")
-            if isinstance(stop, str):
-                generate_config["stop"] = [stop, "Observation:"]
-            elif isinstance(stop, Iterable):
-                assert not isinstance(stop, str)
-                generate_config["stop"] = list(stop) + ["Observation:"]
-            else:
-                generate_config["stop"] = "Observation:"
+            full_context_kwargs["tools"] = tools
+        assert self.model_family.chat_template is not None
+        full_prompt = self.get_full_context(
+            messages, self.model_family.chat_template, **full_context_kwargs
+        )
+        generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
         if stream:
-            agen = await self.async_generate(full_prompt, generate_config, tools)
+            agen = await self.async_generate(
+                full_prompt, generate_config, tools, request_id=request_id
+            )
             assert isinstance(agen, AsyncGenerator)
+            if tools:
+                return self._async_to_tool_completion_chunks(agen)
             return self._async_to_chat_completion_chunks(agen)
         else:
-            c = await self.async_generate(full_prompt, generate_config)
+            c = await self.async_generate(
+                full_prompt, generate_config, request_id=request_id
+            )
             assert not isinstance(c, AsyncGenerator)
             if tools:
-                return self._tool_calls_completion(
-                    self.model_family, self.model_uid, c, tools
-                )
+                return self._tool_calls_completion(self.model_family, self.model_uid, c)
             return self._to_chat_completion(c)
@@ -666,28 +713,30 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         self,
         generate_config: Optional[Dict] = None,
     ) -> Dict:
+        from ..utils import get_stop_token_ids_from_config_file
         if not generate_config:
             generate_config = {}
-        if self.model_family.prompt_style:
-            if self.model_family.prompt_style.stop_token_ids:
-                generate_config.setdefault(
-                    "stop_token_ids",
-                    self.model_family.prompt_style.stop_token_ids.copy(),
-                )
+        if generate_config.get("stop_token_ids", None) is None:
+            stop_token_ids = get_stop_token_ids_from_config_file(self.model_path)
+            if stop_token_ids is not None:
+                generate_config.setdefault("stop_token_ids", stop_token_ids)
+            else:
+                if self.model_family.stop_token_ids:
+                    generate_config.setdefault(
+                        "stop_token_ids", self.model_family.stop_token_ids.copy()
+                    )
         return generate_config
     async def async_chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[Dict] = None,
+        request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         # only support single image, waiting vllm support multi images
-        assert self.model_family.prompt_style is not None
-        prompt_style = self.model_family.prompt_style.copy()
-        chat_history = chat_history or []
-        prompt, images = self.get_prompt(prompt, chat_history, prompt_style)
+        model_family = self.model_family.model_family or self.model_family.model_name
+        prompt, images = self.get_specific_prompt(model_family, messages)
         if len(images) == 0:
             inputs = {
@@ -703,10 +752,14 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)
         if stream:
-            agen = await self.async_generate(inputs, generate_config)
+            agen = await self.async_generate(
+                inputs, generate_config, request_id=request_id
+            )
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
-            c = await self.async_generate(inputs, generate_config)
+            c = await self.async_generate(
+                inputs, generate_config, request_id=request_id
+            )
             assert not isinstance(c, AsyncGenerator)
             return self._to_chat_completion(c)

xinference/model/rerank/core.py CHANGED Viewed

@@ -15,6 +15,7 @@
 import gc
 import logging
 import os
+import threading
 import uuid
 from collections import defaultdict
 from collections.abc import Sequence
@@ -22,6 +23,7 @@ from typing import Dict, List, Literal, Optional, Tuple
 import numpy as np
 import torch
+import torch.nn as nn
 from ...constants import XINFERENCE_CACHE_DIR
 from ...device_utils import empty_cache
@@ -49,6 +51,7 @@ class RerankModelSpec(CacheableModelSpec):
     model_name: str
     language: List[str]
     type: Optional[str] = "unknown"
+    max_tokens: Optional[int]
     model_id: str
     model_revision: Optional[str]
     model_hub: str = "huggingface"
@@ -102,6 +105,30 @@ def generate_rerank_description(model_spec: RerankModelSpec) -> Dict[str, List[D
     return res
+class _ModelWrapper:
+    def __init__(self, module: nn.Module):
+        self._module = module
+        self._local_data = threading.local()
+    @property
+    def n_tokens(self):
+        return getattr(self._local_data, "n_tokens", 0)
+    @n_tokens.setter
+    def n_tokens(self, new_n_tokens):
+        self._local_data.n_tokens = new_n_tokens
+    def __getattr__(self, attr):
+        return getattr(self._module, attr)
+    def __call__(self, **kwargs):
+        attention_mask = kwargs["attention_mask"]
+        # when batching, the attention mask 1 means there is a token
+        # thus we just sum up it to get the total number of tokens
+        self.n_tokens += attention_mask.sum().item()
+        return self._module(**kwargs)
 class RerankModel:
     def __init__(
         self,
@@ -166,6 +193,7 @@ class RerankModel:
                 self._model_path,
                 device=self._device,
                 trust_remote_code=True,
+                max_length=getattr(self._model_spec, "max_tokens"),
                 **self._model_config,
             )
             if self._use_fp16:
@@ -189,6 +217,8 @@ class RerankModel:
                 raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
             self._model = FlagReranker(self._model_path, use_fp16=self._use_fp16)
+        # Wrap transformers model to record number of tokens
+        self._model.model = _ModelWrapper(self._model.model)
     def rerank(
         self,
@@ -200,17 +230,14 @@ class RerankModel:
         return_len: Optional[bool],
         **kwargs,
     ) -> Rerank:
-        self._counter += 1
-        if self._counter % RERANK_EMPTY_CACHE_COUNT == 0:
-            logger.debug("Empty rerank cache.")
-            gc.collect()
-            empty_cache()
         assert self._model is not None
         if kwargs:
             raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
+        # reset n tokens
+        self._model.model.n_tokens = 0
         if self._model_spec.type == "normal":
             similarity_scores = self._model.predict(
                 sentence_combinations, convert_to_numpy=False, convert_to_tensor=True
@@ -245,9 +272,7 @@ class RerankModel:
                 for arg in sim_scores_argsort
             ]
         if return_len:
-            tokenizer = self._get_tokenizer(self._model_path)
-            input_len = sum([len(tokenizer.tokenize(t)) for t in documents])
+            input_len = self._model.model.n_tokens
             # Rerank Model output is just score or documents
             # while return_documents = True
             output_len = input_len
@@ -265,6 +290,14 @@ class RerankModel:
             "warnings": None,
         }
+        del similarity_scores
+        # clear cache if possible
+        self._counter += 1
+        if self._counter % RERANK_EMPTY_CACHE_COUNT == 0:
+            logger.debug("Empty rerank cache.")
+            gc.collect()
+            empty_cache()
         return Rerank(id=str(uuid.uuid1()), results=docs, meta=metadata)

xinference/model/rerank/model_spec.json CHANGED Viewed

@@ -3,6 +3,7 @@
     "model_name": "bge-reranker-large",
     "type": "normal",
     "language": ["en", "zh"],
+    "max_tokens": 512,
     "model_id": "BAAI/bge-reranker-large",
     "model_revision": "27c9168d479987529781de8474dff94d69beca11"
   },
@@ -10,6 +11,7 @@
     "model_name": "bge-reranker-base",
     "type": "normal",
     "language": ["en", "zh"],
+    "max_tokens": 512,
     "model_id": "BAAI/bge-reranker-base",
     "model_revision": "465b4b7ddf2be0a020c8ad6e525b9bb1dbb708ae"
   },
@@ -17,6 +19,7 @@
     "model_name": "bce-reranker-base_v1",
     "type": "normal",
     "language": ["en", "zh"],
+    "max_tokens": 512,
     "model_id": "maidalun1020/bce-reranker-base_v1",
     "model_revision": "eaa31a577a0574e87a08959bd229ca14ce1b5496"
   },
@@ -24,6 +27,7 @@
     "model_name": "bge-reranker-v2-m3",
     "type": "normal",
     "language": ["en", "zh", "multilingual"],
+    "max_tokens": 8192,
     "model_id": "BAAI/bge-reranker-v2-m3",
     "model_revision": "12e974610ba9083ed95f3edf08d7e899581f4de4"
   },
@@ -31,6 +35,7 @@
     "model_name": "bge-reranker-v2-gemma",
     "type": "LLM-based",
     "language": ["en", "zh", "multilingual"],
+    "max_tokens": 8192,
     "model_id": "BAAI/bge-reranker-v2-gemma",
     "model_revision": "1787044f8b6fb740a9de4557c3a12377f84d9e17"
   },
@@ -38,6 +43,7 @@
     "model_name": "bge-reranker-v2-minicpm-layerwise",
     "type": "LLM-based layerwise",
     "language": ["en", "zh", "multilingual"],
+    "max_tokens": 2048,
     "model_id": "BAAI/bge-reranker-v2-minicpm-layerwise",
     "model_revision": "47b5332b296c4d8cb6ee2c60502cc62a0d708881"
   },
@@ -45,6 +51,7 @@
     "model_name": "jina-reranker-v2",
     "type": "normal",
     "language": ["en", "zh", "multilingual"],
+    "max_tokens": 1024,
     "model_id": "jinaai/jina-reranker-v2-base-multilingual",
     "model_revision": "298e48cada4a9318650d7fbd795f63827f884087"
   }

xinference/model/rerank/model_spec_modelscope.json CHANGED Viewed

@@ -3,6 +3,7 @@
     "model_name": "bge-reranker-base",
     "type": "normal",
     "language": ["en", "zh"],
+    "max_tokens": 512,
     "model_id": "Xorbits/bge-reranker-base",
     "model_revision": "v0.0.1",
     "model_hub": "modelscope"
@@ -11,6 +12,7 @@
     "model_name": "bge-reranker-large",
     "type": "normal",
     "language": ["en", "zh"],
+    "max_tokens": 512,
     "model_id": "Xorbits/bge-reranker-large",
     "model_revision": "v0.0.1",
     "model_hub": "modelscope"
@@ -19,6 +21,7 @@
     "model_name": "bce-reranker-base_v1",
     "type": "normal",
     "language": ["en", "zh"],
+    "max_tokens": 512,
     "model_id": "maidalun/bce-reranker-base_v1",
     "model_revision": "v0.0.1",
     "model_hub": "modelscope"
@@ -26,6 +29,7 @@
   {
     "model_name": "bge-reranker-v2-m3",
     "type": "normal",
+    "max_tokens": 8192,
     "language": ["en", "zh", "multilingual"],
     "model_id": "AI-ModelScope/bge-reranker-v2-m3",
     "model_hub": "modelscope"
@@ -34,6 +38,7 @@
     "model_name": "bge-reranker-v2-gemma",
     "type": "LLM-based",
     "language": ["en", "zh", "multilingual"],
+    "max_tokens": 8192,
     "model_id": "AI-ModelScope/bge-reranker-v2-gemma",
     "model_hub": "modelscope"
   },
@@ -41,7 +46,8 @@
     "model_name": "bge-reranker-v2-minicpm-layerwise",
     "type": "LLM-based layerwise",
     "language": ["en", "zh", "multilingual"],
-    "model_id": "zfffff/bge-reranker-v2-minicpm-layerwise",
+    "max_tokens": 2048,
+    "model_id": "mirror013/bge-reranker-v2-minicpm-layerwise",
     "model_hub": "modelscope"
   }
 ]

xinference/model/utils.py CHANGED Viewed

@@ -11,10 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import functools
-import gc
-import inspect
 import json
 import logging
 import os
@@ -28,7 +24,7 @@ import numpy as np
 import torch
 from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
-from ..device_utils import empty_cache, get_available_device, is_device_available
+from ..device_utils import get_available_device, is_device_available
 from .core import CacheableModelSpec
 logger = logging.getLogger(__name__)
@@ -357,32 +353,6 @@ def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
         return str(model_size)
-def ensure_cache_cleared(func: Callable):
-    assert not inspect.iscoroutinefunction(func) and not inspect.isasyncgenfunction(
-        func
-    )
-    if inspect.isgeneratorfunction(func):
-        @functools.wraps(func)
-        def inner(*args, **kwargs):
-            for obj in func(*args, **kwargs):
-                yield obj
-            gc.collect()
-            empty_cache()
-    else:
-        @functools.wraps(func)
-        def inner(*args, **kwargs):
-            try:
-                return func(*args, **kwargs)
-            finally:
-                gc.collect()
-                empty_cache()
-    return inner
 def set_all_random_seed(seed: int):
     random.seed(seed)
     np.random.seed(seed)

xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl

Potentially problematic release.

xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl