xinference 1.11.0__py3-none-any.whl → 1.11.0.post1__py3-none-any.whl


xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-10-19T20:53:12+0800",
+ "date": "2025-10-20T18:17:30+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "baaa40b463e4948762b078f5995d67775df53704",
- "version": "1.11.0"
+ "full-revisionid": "378b99185de5a7623f75798df7e4391f4ff39e35",
+ "version": "1.11.0.post1"
 }
 ''' # END VERSION_JSON
 
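The only change here is the routine Versioneer metadata bump: a new build date, a new revision id, and the 1.11.0.post1 post-release tag. For context, a minimal sketch of how a Versioneer-generated _version.py typically exposes this blob; the get_versions helper below is the standard Versioneer pattern, not code taken from this diff:

import json

# The blob between "version_json = '''" and "''' # END VERSION_JSON" is plain
# JSON baked into the package at build time.
version_json = '''
{
 "date": "2025-10-20T18:17:30+0800",
 "dirty": false,
 "error": null,
 "full-revisionid": "378b99185de5a7623f75798df7e4391f4ff39e35",
 "version": "1.11.0.post1"
}
'''  # END VERSION_JSON


def get_versions():
    """Return the version dict embedded in the module."""
    return json.loads(version_json)


assert get_versions()["version"] == "1.11.0.post1"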
@@ -549,46 +549,30 @@ class PytorchModel(LLM):
         So we need pad `0` on the left again.
         """
         data = []
-        # For decode phase, attention mask should match the full KV cache sequence length
-        # All requests in batch should have attention mask of length `seq_length`
-        for r in reqs:
-            # Get the actual sequence length for this request from its tracking
-            if "attention_mask_seq_len" not in r.extra_kwargs:
-                # Initialize with the current sequence length (full KV cache length)
-                r.extra_kwargs["attention_mask_seq_len"] = seq_length
-            else:
-                # Use the previously tracked length, but ensure it doesn't exceed current seq_length
-                tracked_len = r.extra_kwargs["attention_mask_seq_len"]
-                r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
-
-        # For decode phase after KV cache merge, all requests should have attention mask
-        # that matches the merged sequence length
+        max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
         for r in reqs:
+            r.extra_kwargs["attention_mask_seq_len"] += 1
             real_len = r.extra_kwargs["attention_mask_seq_len"]
+            pad_len = max_len - real_len
 
-            # The attention mask should cover the full sequence length
-            if real_len < seq_length:
-                # Pad with zeros on the left to reach full sequence length
-                pad_len = seq_length - real_len
-
-                if self._tokenizer.padding_side == "left":
-                    x = torch.cat(
-                        [
-                            torch.full((pad_len,), 0, dtype=torch.long),
-                            torch.ones((real_len,), dtype=torch.long),
-                        ]
-                    )
-                else:
-                    x = torch.cat(
-                        [
-                            torch.ones((real_len,), dtype=torch.long),
-                            torch.full((pad_len,), 0, dtype=torch.long),
-                        ]
-                    )
+            if self._tokenizer.padding_side == "left":
+                x = torch.cat(
+                    [
+                        (
+                            torch.full((pad_len,), 0, dtype=torch.long)
+                            if pad_len > 0
+                            else torch.tensor([], dtype=torch.long)
+                        ),
+                        torch.ones((real_len,), dtype=torch.long),
+                    ]
+                )
             else:
-                # Already at correct length
-                x = torch.ones((real_len,), dtype=torch.long)
-
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                    ]
+                )
             data.append(x)
 
         return torch.stack(data).to(self._device)
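The rewrite above replaces per-request clamping against a global seq_length with simpler bookkeeping: each decode step increments every request's tracked mask length by one, then pads each mask up to the batch maximum on the tokenizer's padding side. A self-contained sketch of just that padding arithmetic (request objects are stubbed with dicts and padding_side stands in for self._tokenizer.padding_side; only the logic mirrors the diff):

import torch

# Stubbed request objects; only `extra_kwargs` matters here.
reqs = [
    {"extra_kwargs": {"attention_mask_seq_len": 7}},  # longer request
    {"extra_kwargs": {"attention_mask_seq_len": 5}},  # shorter request
]
padding_side = "left"  # stands in for self._tokenizer.padding_side

# Each decode step emits one token, so every tracked length grows by 1 and
# the batch pads up to the longest request's new length.
max_len = max(r["extra_kwargs"]["attention_mask_seq_len"] for r in reqs) + 1

data = []
for r in reqs:
    r["extra_kwargs"]["attention_mask_seq_len"] += 1
    real_len = r["extra_kwargs"]["attention_mask_seq_len"]
    pad_len = max_len - real_len
    ones = torch.ones((real_len,), dtype=torch.long)
    zeros = torch.zeros((pad_len,), dtype=torch.long)  # empty when pad_len == 0
    x = torch.cat([zeros, ones]) if padding_side == "left" else torch.cat([ones, zeros])
    data.append(x)

print(torch.stack(data))
# tensor([[1, 1, 1, 1, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1, 1, 1, 1]])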
@@ -285,30 +285,10 @@ def _batch_inference_one_step_internal(
         # This prevents batch size mismatches during merging
         decode_kv = decode_reqs[0].kv_cache
 
-        # Verify that all decode requests share the same kv_cache
-        for req in decode_reqs[1:]:
-            if req.kv_cache is not decode_kv:
-                logger.warning(
-                    "Inconsistent kv_cache references detected in decode requests. "
-                    "This may indicate a batching synchronization issue."
-                )
-                # Use the first decode_kv as the reference to maintain consistency
-                req.kv_cache = decode_kv
-
         # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
         merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
-        # Update sequence length information after KV cache merge
-        _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
-            merged_kv_cache, xinf_model_obj
-        )
         for r in valid_req_list:
             r.kv_cache = merged_kv_cache
-            # Update attention mask sequence length to match merged KV cache
-            if "attention_mask_seq_len" in r.extra_kwargs:
-                # Ensure the attention mask length doesn't exceed the merged sequence length
-                r.extra_kwargs["attention_mask_seq_len"] = min(
-                    r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
-                )
         empty_cache()
     else:
         for r in valid_req_list:
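This hunk drops the defensive kv_cache consistency check and the post-merge clamping of attention_mask_seq_len, leaving merge_kv_cache as the single point where prefill and decode caches are reconciled. merge_kv_cache itself is not shown in this diff; the sketch below only illustrates the merge-at-batch_size-and-seq_len idea from the comment, using one layer's key tensor with assumed (batch, heads, seq, head_dim) shapes, not xinference's actual implementation:

import torch
import torch.nn.functional as F

# Illustrative shapes only.
decode_keys = torch.randn(2, 4, 10, 64)   # running decode batch, seq len 10
prefill_keys = torch.randn(1, 4, 6, 64)   # newly prefilled request, seq len 6

# Left-pad the shorter cache along the seq dimension so lengths match; the
# zero-filled positions line up with the `0` entries the attention-mask code
# above pads on the left.
pad = decode_keys.shape[2] - prefill_keys.shape[2]
prefill_keys = F.pad(prefill_keys, (0, 0, pad, 0))

# Then concatenate along the batch dimension.
merged_keys = torch.cat([decode_keys, prefill_keys], dim=0)
print(merged_keys.shape)  # torch.Size([3, 4, 10, 64])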
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5ea97072.css",
-    "main.js": "./static/js/main.45e78536.js",
+    "main.js": "./static/js/main.e4d9a9e1.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5ea97072.css.map": "./static/css/main.5ea97072.css.map",
-    "main.45e78536.js.map": "./static/js/main.45e78536.js.map"
+    "main.e4d9a9e1.js.map": "./static/js/main.e4d9a9e1.js.map"
   },
   "entrypoints": [
     "static/css/main.5ea97072.css",
-    "static/js/main.45e78536.js"
+    "static/js/main.e4d9a9e1.js"
   ]
 }
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.45e78536.js"></script><link href="./static/css/main.5ea97072.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e4d9a9e1.js"></script><link href="./static/css/main.5ea97072.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>