xinference 1.6.0.post1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +1 -1
  3. xinference/conftest.py +0 -7
  4. xinference/core/media_interface.py +9 -8
  5. xinference/core/model.py +13 -6
  6. xinference/core/scheduler.py +1 -10
  7. xinference/core/worker.py +0 -10
  8. xinference/model/audio/model_spec.json +53 -1
  9. xinference/model/audio/model_spec_modelscope.json +57 -1
  10. xinference/model/embedding/core.py +19 -11
  11. xinference/model/image/model_spec.json +10 -1
  12. xinference/model/image/model_spec_modelscope.json +20 -0
  13. xinference/model/llm/__init__.py +6 -54
  14. xinference/model/llm/core.py +19 -5
  15. xinference/model/llm/llama_cpp/core.py +59 -3
  16. xinference/model/llm/llama_cpp/memory.py +455 -0
  17. xinference/model/llm/llm_family.json +185 -397
  18. xinference/model/llm/llm_family.py +88 -16
  19. xinference/model/llm/llm_family_modelscope.json +199 -421
  20. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  21. xinference/model/llm/sglang/core.py +4 -0
  22. xinference/model/llm/transformers/__init__.py +27 -6
  23. xinference/model/llm/transformers/chatglm.py +4 -2
  24. xinference/model/llm/transformers/core.py +49 -28
  25. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  26. xinference/model/llm/transformers/gemma3.py +119 -164
  27. xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
  28. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  29. xinference/model/llm/transformers/multimodal/core.py +205 -0
  30. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  31. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  32. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  33. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  34. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  35. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  36. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  37. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  38. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  39. xinference/model/llm/transformers/opt.py +4 -2
  40. xinference/model/llm/transformers/utils.py +6 -37
  41. xinference/model/llm/vllm/core.py +4 -0
  42. xinference/model/rerank/core.py +7 -1
  43. xinference/model/rerank/utils.py +17 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
  47. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
  52. xinference/web/ui/src/locales/en.json +3 -1
  53. xinference/web/ui/src/locales/zh.json +3 -1
  54. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/METADATA +6 -4
  55. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
  56. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
  57. xinference/model/llm/transformers/cogvlm2.py +0 -442
  58. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  59. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  60. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  61. xinference/model/llm/transformers/intern_vl.py +0 -526
  62. xinference/model/llm/transformers/internlm2.py +0 -94
  63. xinference/model/llm/transformers/minicpmv25.py +0 -193
  64. xinference/model/llm/transformers/omnilmm.py +0 -132
  65. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  66. xinference/model/llm/transformers/qwen_vl.py +0 -360
  67. xinference/thirdparty/omnilmm/LICENSE +0 -201
  68. xinference/thirdparty/omnilmm/__init__.py +0 -0
  69. xinference/thirdparty/omnilmm/chat.py +0 -218
  70. xinference/thirdparty/omnilmm/constants.py +0 -4
  71. xinference/thirdparty/omnilmm/conversation.py +0 -332
  72. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  73. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  74. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  75. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  76. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  77. xinference/thirdparty/omnilmm/utils.py +0 -134
  78. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  79. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  84. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
  85. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
  86. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
  87. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/utils.py
@@ -191,42 +191,13 @@ def _get_pad_param(seq_len_idx: int, pad_len: int) -> Tuple:
     return tuple(dimensions)
 
 
-def _merge_kv_cache(
-    xinf_model_obj: "PytorchModel",
-    past_cache: DynamicCache,
-    new_cache: DynamicCache,
-) -> DynamicCache:
-    from torch.nn.functional import pad
-
-    _, seq_len_idx = xinf_model_obj.get_batch_size_and_seq_len_indexes_from_kv()
-    past_seq_len = past_cache[0][0].shape[seq_len_idx]
-    new_seq_len = new_cache[0][0].shape[seq_len_idx]
-    if past_seq_len != new_seq_len:
-        padding_target = new_cache if past_seq_len > new_seq_len else past_cache
-        padding_len = abs(past_seq_len - new_seq_len)
-        pad_param = _get_pad_param(seq_len_idx, padding_len)
-        for idx in range(len(padding_target)):
-            k = padding_target.key_cache[idx]
-            v = padding_target.value_cache[idx]
-            _k = pad(k, pad_param)
-            _v = pad(v, pad_param)
-            padding_target.key_cache[idx] = _k
-            padding_target.value_cache[idx] = _v
-
-    ret_kv = DynamicCache()
-    for idx in range(len(past_cache)):
-        k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
-        v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-        ret_kv.update(
-            torch.cat((k1, k2), 0).contiguous(),
-            torch.cat((v1, v2), 0).contiguous(),
-            idx,
-        )
-    return ret_kv
-
-
 def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel"):
+    from transformers import HybridCache
+
     bs_idx, seq_len_idx = xinf_model_obj.get_batch_size_and_seq_len_indexes_from_kv()
+
+    if isinstance(kv, HybridCache):
+        return kv.key_cache[0].shape[bs_idx], kv.get_seq_length()
     return kv[0][0].shape[bs_idx], kv[0][0].shape[seq_len_idx] + 1
 
 
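This hunk removes the module-level _merge_kv_cache helper (the merge moves onto the model object, see the next hunk) and teaches the sequence-length probe about transformers' HybridCache, whose length comes from get_seq_length() rather than a tensor shape. As a rough illustration of what the removed helper did per cache layer — pad the shorter cache along the sequence dimension, then concatenate along the batch dimension — here is a minimal standalone sketch; the layout, shapes, and function name are hypothetical, not the xinference API:

import torch
from torch.nn.functional import pad

def merge_layer(k1: torch.Tensor, k2: torch.Tensor) -> torch.Tensor:
    # Assume a [batch, heads, seq, head_dim] layout, i.e. seq is dim 2.
    diff = k1.shape[2] - k2.shape[2]
    if diff > 0:
        # pad() takes (left, right) pairs starting from the LAST dim,
        # so (0, 0, 0, n) pads the right side of dim 2 by n.
        k2 = pad(k2, (0, 0, 0, diff))
    elif diff < 0:
        k1 = pad(k1, (0, 0, 0, -diff))
    # With seq lengths equalized, stacking on the batch dim is valid.
    return torch.cat((k1, k2), dim=0).contiguous()

k_decode = torch.zeros(1, 8, 12, 64)   # one in-flight decode request
k_prefill = torch.zeros(2, 8, 7, 64)   # two freshly prefilled requests
print(merge_layer(k_decode, k_prefill).shape)  # torch.Size([3, 8, 12, 64])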
xinference/model/llm/transformers/utils.py
@@ -304,9 +275,7 @@ def _batch_inference_one_step_internal(
     if decode_reqs:
         decode_kv = decode_reqs[0].kv_cache
         # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
-        merged_kv_cache = _merge_kv_cache(
-            xinf_model_obj, decode_kv, past_key_values
-        )
+        merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
         for r in valid_req_list:
             r.kv_cache = merged_kv_cache
         empty_cache()
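This is the call-site counterpart of the previous hunk: the merge now goes through a method on the model object, presumably so that models with non-standard cache types (such as the HybridCache path added above) can override it instead of being forced through the old DynamicCache-only helper.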
xinference/model/llm/vllm/core.py
@@ -199,7 +199,11 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.1":
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v3")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v3-0324")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-0528")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-prover-v2")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-0528-qwen3")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
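Once whitelisted here, these models can be served through the vLLM engine via the usual client call. A hedged usage sketch, assuming a running local xinference server at the default endpoint (the endpoint and model choice are illustrative):

from xinference.client import Client

client = Client("http://localhost:9997")  # assumes a local server is up
model_uid = client.launch_model(
    model_name="deepseek-r1-0528-qwen3",  # one of the newly whitelisted names
    model_engine="vllm",
)
model = client.get_model(model_uid)
reply = model.chat(messages=[{"role": "user", "content": "Hello"}])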
xinference/model/rerank/core.py
@@ -265,7 +265,13 @@ class RerankModel:
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         logger.info("Rerank with kwargs: %s, model: %s", kwargs, self._model)
-        sentence_combinations = [[query, doc] for doc in documents]
+
+        from .utils import preprocess_sentence
+
+        pre_query = preprocess_sentence(
+            query, kwargs.get("instruction", None), self._model_spec.model_name
+        )
+        sentence_combinations = [[pre_query, doc] for doc in documents]
         # reset n tokens
         self._model.model.n_tokens = 0
         if self._model_spec.type == "normal":
xinference/model/rerank/utils.py
@@ -11,8 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Any
+
 from .core import RerankModelSpec
 
 
 def get_model_version(rerank_model: RerankModelSpec) -> str:
     return rerank_model.model_name
+
+
+instruction_cfg = {
+    "minicpm-reranker": "Query: ",
+}
+
+
+def preprocess_sentence(query: str, instruction: Any, model_name: str) -> str:
+    if instruction and isinstance(instruction, str):
+        return f"{instruction}{query}"
+    if instruction is None:
+        for k, v in instruction_cfg.items():
+            if k.lower() in model_name.lower():
+                return f"{v}{query}"
+    return query
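The new helper has three branches, demonstrated below directly from the code above (the model names are illustrative; only minicpm-reranker actually appears in instruction_cfg):

from xinference.model.rerank.utils import preprocess_sentence

# 1. An explicit string instruction is prepended verbatim.
preprocess_sentence("what is a panda?", "Represent this query: ", "bge-reranker-v2-m3")
# -> 'Represent this query: what is a panda?'

# 2. No instruction: a case-insensitive model-name match applies the default prefix.
preprocess_sentence("what is a panda?", None, "MiniCPM-Reranker")
# -> 'Query: what is a panda?'

# 3. Otherwise the query passes through unchanged.
preprocess_sentence("what is a panda?", None, "bge-reranker-base")
# -> 'what is a panda?'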
xinference/web/ui/build/asset-manifest.json
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.337afe76.css",
-    "main.js": "./static/js/main.ae579a97.js",
+    "main.js": "./static/js/main.ddf9eaee.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.337afe76.css.map": "./static/css/main.337afe76.css.map",
-    "main.ae579a97.js.map": "./static/js/main.ae579a97.js.map"
+    "main.ddf9eaee.js.map": "./static/js/main.ddf9eaee.js.map"
   },
   "entrypoints": [
     "static/css/main.337afe76.css",
-    "static/js/main.ae579a97.js"
+    "static/js/main.ddf9eaee.js"
   ]
 }
xinference/web/ui/build/index.html
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.ae579a97.js"></script><link href="./static/css/main.337afe76.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.ddf9eaee.js"></script><link href="./static/css/main.337afe76.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>