xinference 0.16.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (60)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +62 -11
  3. xinference/client/restful/restful_client.py +8 -2
  4. xinference/conftest.py +0 -8
  5. xinference/constants.py +2 -0
  6. xinference/core/model.py +44 -5
  7. xinference/core/supervisor.py +13 -7
  8. xinference/core/utils.py +76 -12
  9. xinference/core/worker.py +5 -4
  10. xinference/deploy/cmdline.py +5 -0
  11. xinference/deploy/utils.py +7 -4
  12. xinference/model/audio/model_spec.json +2 -2
  13. xinference/model/image/stable_diffusion/core.py +5 -2
  14. xinference/model/llm/core.py +1 -3
  15. xinference/model/llm/llm_family.json +263 -4
  16. xinference/model/llm/llm_family_modelscope.json +302 -0
  17. xinference/model/llm/mlx/core.py +45 -2
  18. xinference/model/llm/vllm/core.py +2 -1
  19. xinference/model/rerank/core.py +11 -4
  20. xinference/thirdparty/fish_speech/fish_speech/conversation.py +254 -0
  21. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  22. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  23. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  24. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  25. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  26. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +76 -11
  27. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  28. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  29. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +32 -1
  30. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  31. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  32. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  33. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  34. xinference/thirdparty/fish_speech/tools/api.py +578 -75
  35. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  36. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  37. xinference/thirdparty/fish_speech/tools/llama/generate.py +393 -9
  38. xinference/thirdparty/fish_speech/tools/msgpack_api.py +90 -29
  39. xinference/thirdparty/fish_speech/tools/post_api.py +37 -15
  40. xinference/thirdparty/fish_speech/tools/schema.py +187 -0
  41. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  42. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  43. xinference/thirdparty/fish_speech/tools/webui.py +138 -75
  44. {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/METADATA +26 -3
  45. {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/RECORD +49 -56
  46. {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/WHEEL +1 -1
  47. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  51. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  53. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  54. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  55. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  56. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  57. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  58. {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/LICENSE +0 -0
  59. {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/entry_points.txt +0 -0
  60. {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -363,6 +363,97 @@
  "<|eom_id|>"
  ]
  },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "llama-3.2-vision-instruct",
+ "model_lang": [
+ "en",
+ "de",
+ "fr",
+ "it",
+ "pt",
+ "hi",
+ "es",
+ "th"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image...",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 11,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "LLM-Research/Llama-3.2-11B-Vision-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 90,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "LLM-Research/Llama-3.2-90B-Vision-Instruct",
+ "model_hub": "modelscope"
+ }
+ ],
+ "chat_template": "{% for message in messages %}{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{ '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+ "stop_token_ids": [
+ 128001,
+ 128008,
+ 128009
+ ],
+ "stop": [
+ "<|end_of_text|>",
+ "<|eot_id|>",
+ "<|eom_id|>"
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "llama-3.2-vision",
+ "model_lang": [
+ "en",
+ "de",
+ "fr",
+ "it",
+ "pt",
+ "hi",
+ "es",
+ "th"
+ ],
+ "model_ability": [
+ "generate",
+ "vision"
+ ],
+ "model_description": "The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image...",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 11,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "LLM-Research/Llama-3.2-11B-Vision",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 90,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "LLM-Research/Llama-3.2-90B-Vision",
+ "model_hub": "modelscope"
+ }
+ ]
+ },
  {
  "version": 1,
  "context_length": 2048,
xinference/model/llm/llm_family_modelscope.json (continued)

@@ -5816,6 +5907,18 @@
  ],
  "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
  "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-0.5B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": "1_5",
@@ -5828,6 +5931,18 @@
  "model_revision": "master",
  "model_hub": "modelscope"
  },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "3",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-3B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 7,
@@ -5839,6 +5954,30 @@
  "model_id": "qwen/Qwen2.5-Coder-7B",
  "model_revision": "master",
  "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-14B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-32B",
+ "model_revision": "master",
+ "model_hub": "modelscope"
  }
  ]
  },
@@ -5856,6 +5995,18 @@
  ],
  "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
  "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": "1_5",
@@ -5867,6 +6018,17 @@
  "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct",
  "model_revision": "master",
  "model_hub": "modelscope"
+ }, {
+ "model_format": "pytorch",
+ "model_size_in_billions": "3",
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-3B-Instruct",
+ "model_revision": "master",
+ "model_hub": "modelscope"
  },
  {
  "model_format": "pytorch",
@@ -5880,6 +6042,63 @@
  "model_revision": "master",
  "model_hub": "modelscope"
  },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-14B-Instruct",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-32B-Instruct",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-{quantization}",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-{quantization}",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-{quantization}",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "gptq",
  "model_size_in_billions": 7,
@@ -5891,6 +6110,89 @@
  "model_revision": "master",
  "model_hub": "modelscope"
  },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-{quantization}",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 3,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-AWQ",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-AWQ",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-AWQ",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 32,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-AWQ",
+ "model_revision": "master",
+ "model_hub": "modelscope"
+ },
+
  {
  "model_format": "ggufv2",
  "model_size_in_billions": "1_5",
xinference/model/llm/mlx/core.py

@@ -17,7 +17,8 @@ import platform
  import sys
  import time
  import uuid
- from typing import Dict, Iterator, List, Optional, TypedDict, Union
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict, Union

  from ....fields import max_tokens_field
  from ....types import (
@@ -53,6 +54,14 @@ class MLXGenerateConfig(TypedDict, total=False):
  stream: bool
  stream_options: Optional[Union[dict, None]]
  tools: Optional[List[Dict]]
+ lora_name: Optional[str]
+
+
+ @dataclass
+ class PromptCache:
+ cache: List[Any] = field(default_factory=list)
+ model_key: Tuple[str, Optional[str]] = ("", None)
+ tokens: List[int] = field(default_factory=list)


  class MLXModel(LLM):
@@ -69,6 +78,8 @@ class MLXModel(LLM):
  super().__init__(model_uid, model_family, model_spec, quantization, model_path)
  self._use_fast_tokenizer = True
  self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
+ self._max_kv_size = None
+ self._prompt_cache = None
  if peft_model is not None:
  raise ValueError("MLX engine has not supported lora yet")

@@ -127,6 +138,9 @@
  logger.debug(f"Setting cache limit to {cache_limit_gb} GB")
  mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)

+ self._max_kv_size = kwargs.get("max_kv_size", None)
+ self._prompt_cache = PromptCache()
+
  return load(
  self.model_path,
  tokenizer_config=tokenizer_config,
@@ -156,6 +170,27 @@
  return False
  return True

+ def _get_prompt_cache(self, prompt, lora_name: Optional[str] = None):
+ from mlx_lm.models.cache import make_prompt_cache
+
+ assert self._prompt_cache is not None
+ cache_len = len(self._prompt_cache.tokens)
+ model_key = (self.model_path, lora_name)
+ if (
+ self._prompt_cache.model_key != model_key
+ or cache_len >= len(prompt)
+ or self._prompt_cache.tokens != prompt[:cache_len]
+ ):
+ self._prompt_cache.model_key = model_key
+ self._prompt_cache.cache = make_prompt_cache(self._model, self._max_kv_size)
+ self._prompt_cache.tokens = []
+ logger.debug("Making new prompt cache for %s", self.model_uid)
+ else:
+ prompt = prompt[cache_len:]
+ logger.debug("Cache hit for %s", self.model_uid)
+ self._prompt_cache.tokens.extend(prompt)
+ return prompt
+
  def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig):
  import mlx.core as mx
  from mlx_lm.utils import generate_step
@@ -167,6 +202,7 @@
  chunk_id = str(uuid.uuid4())
  stop_token_ids = kwargs.get("stop_token_ids", [])
  stream = kwargs.get("stream", False)
+ lora_name = kwargs.get("lora_name")
  stream_options = kwargs.pop("stream_options", None)
  include_usage = (
  stream_options["include_usage"]
@@ -174,12 +210,15 @@
  else False
  )

- prompt_tokens = mx.array(tokenizer.encode(prompt))
+ prompt_token_ids = tokenizer.encode(prompt)
+ prompt_token_ids = self._get_prompt_cache(prompt_token_ids, lora_name)
+ prompt_tokens = mx.array(prompt_token_ids)
  input_echo_len = len(prompt_tokens)

  i = 0
  start = time.time()
  output = ""
+ tokens = []
  for (token, _), i in zip(
  generate_step(
  prompt_tokens,
@@ -189,9 +228,11 @@
  repetition_context_size=kwargs["repetition_context_size"],
  top_p=kwargs["top_p"],
  logit_bias=kwargs["logit_bias"],
+ prompt_cache=self._prompt_cache.cache, # type: ignore
  ),
  range(max_tokens),
  ):
+ tokens.append(token)
  if token == tokenizer.eos_token_id or token in stop_token_ids: # type: ignore
  break

@@ -230,6 +271,8 @@
  f"Average generation speed: {i / (time.time() - start):.2f} tokens/s."
  )

+ self._prompt_cache.tokens.extend(tokens) # type: ignore
+
  if i == max_tokens - 1:
  finish_reason = "length"
  else:
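The MLX changes above add a per-model prompt cache: if the token IDs of a new request extend the previously cached sequence, only the new suffix is fed to generate_step and the cached KV state is reused; otherwise the cache is rebuilt from scratch. A standalone sketch of that reuse decision (simplified and outside the MLXModel class; make_new_cache stands in for mlx_lm.models.cache.make_prompt_cache):

# Simplified sketch of the prompt-cache reuse logic shown in the diff above.
from dataclasses import dataclass, field
from typing import Any, Callable, List, Optional, Tuple

@dataclass
class PromptCache:
    cache: List[Any] = field(default_factory=list)
    model_key: Tuple[str, Optional[str]] = ("", None)
    tokens: List[int] = field(default_factory=list)

def prompt_suffix_to_run(
    pc: PromptCache,
    model_key: Tuple[str, Optional[str]],
    prompt: List[int],
    make_new_cache: Callable[[], List[Any]],
) -> List[int]:
    cache_len = len(pc.tokens)
    if (
        pc.model_key != model_key            # different model path or LoRA
        or cache_len >= len(prompt)          # prompt does not extend the cache
        or pc.tokens != prompt[:cache_len]   # cached prefix no longer matches
    ):
        pc.model_key = model_key
        pc.cache = make_new_cache()          # rebuild the KV cache
        pc.tokens = []
    else:
        prompt = prompt[cache_len:]          # cache hit: run only the new suffix
    pc.tokens.extend(prompt)
    return prompt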
xinference/model/llm/vllm/core.py

@@ -163,7 +163,6 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.1":
  VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628")
  VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5")

-
  if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
  VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
  VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct")
@@ -177,6 +176,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
  VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")

  if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
+ VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
+ VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
  VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")

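The new Llama 3.2 vision entries are routed to the vLLM engine only when the installed vLLM is at least 0.6.3; the gate is the same version-string comparison used elsewhere in the module. A minimal standalone check along the same lines (a sketch, not part of xinference):

# Sketch: reproduce the version gate outside xinference.
import importlib.util

if importlib.util.find_spec("vllm") is not None:
    import vllm

    if vllm.__version__ >= "0.6.3":  # same string comparison the module uses
        print("llama-3.2-vision / llama-3.2-vision-instruct can use the vLLM engine")
    else:
        print("installed vLLM is too old for the new Llama 3.2 vision entries")
else:
    print("vLLM is not installed")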
xinference/model/rerank/core.py

@@ -179,6 +179,7 @@ class RerankModel:
  return rerank_type

  def load(self):
+ logger.info("Loading rerank model: %s", self._model_path)
  flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
  if (
  self._auto_detect_type(self._model_path) != "normal"
@@ -189,6 +190,7 @@ class RerankModel:
  "will force set `use_fp16` to True"
  )
  self._use_fp16 = True
+
  if self._model_spec.type == "normal":
  try:
  import sentence_transformers
@@ -250,22 +252,27 @@ class RerankModel:
  **kwargs,
  ) -> Rerank:
  assert self._model is not None
- if kwargs:
- raise ValueError("rerank hasn't support extra parameter.")
  if max_chunks_per_doc is not None:
  raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
+ logger.info("Rerank with kwargs: %s, model: %s", kwargs, self._model)
  sentence_combinations = [[query, doc] for doc in documents]
  # reset n tokens
  self._model.model.n_tokens = 0
  if self._model_spec.type == "normal":
  similarity_scores = self._model.predict(
- sentence_combinations, convert_to_numpy=False, convert_to_tensor=True
+ sentence_combinations,
+ convert_to_numpy=False,
+ convert_to_tensor=True,
+ **kwargs,
  ).cpu()
  if similarity_scores.dtype == torch.bfloat16:
  similarity_scores = similarity_scores.float()
  else:
  # Related issue: https://github.com/xorbitsai/inference/issues/1775
- similarity_scores = self._model.compute_score(sentence_combinations)
+ similarity_scores = self._model.compute_score(
+ sentence_combinations, **kwargs
+ )
+
  if not isinstance(similarity_scores, Sequence):
  similarity_scores = [similarity_scores]
  elif (
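With the kwargs guard removed, extra keyword arguments passed to rerank now flow through to the backend model: sentence-transformers' predict for "normal" rerank models, or compute_score otherwise. A hedged usage sketch over the RESTful client, assuming a rerank model is already launched and that the client forwards extra keyword arguments (the endpoint, model name, and batch_size value are illustrative, not taken from the diff):

# Sketch only: endpoint, model name, and the forwarded batch_size are assumptions.
from xinference.client import Client

client = Client("http://localhost:9997")
model = client.get_model("bge-reranker-v2-m3")  # assumes this rerank model is running
result = model.rerank(
    documents=[
        "Gestation in elephants lasts around 22 months.",
        "Elephants are the largest living land animals.",
    ],
    query="How long is an elephant pregnant?",
    top_n=1,
    batch_size=16,  # extra kwarg, forwarded to the backend after this change
)
print(result["results"][0])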