xinference 1.11.0__py3-none-any.whl → 1.11.0.post1__py3-none-any.whl


xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-10-19T20:53:12+0800",
+ "date": "2025-10-20T18:17:30+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "baaa40b463e4948762b078f5995d67775df53704",
- "version": "1.11.0"
+ "full-revisionid": "378b99185de5a7623f75798df7e4391f4ff39e35",
+ "version": "1.11.0.post1"
 }
 ''' # END VERSION_JSON
 
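The only change here is the routine Versioneer metadata bump: a new build date, a new revision id, and the 1.11.0.post1 post-release tag. For context, a minimal sketch of how a Versioneer-generated _version.py typically exposes this blob; the get_versions helper below is the standard Versioneer pattern, not code taken from this diff:

import json

# The blob between "version_json = '''" and "''' # END VERSION_JSON" is plain
# JSON baked into the package at build time.
version_json = '''
{
 "date": "2025-10-20T18:17:30+0800",
 "dirty": false,
 "error": null,
 "full-revisionid": "378b99185de5a7623f75798df7e4391f4ff39e35",
 "version": "1.11.0.post1"
}
'''  # END VERSION_JSON


def get_versions():
    """Return the version dict embedded in the module."""
    return json.loads(version_json)


assert get_versions()["version"] == "1.11.0.post1"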
@@ -549,46 +549,30 @@ class PytorchModel(LLM):
         So we need pad `0` on the left again.
         """
         data = []
-        # For decode phase, attention mask should match the full KV cache sequence length
-        # All requests in batch should have attention mask of length `seq_length`
-        for r in reqs:
-            # Get the actual sequence length for this request from its tracking
-            if "attention_mask_seq_len" not in r.extra_kwargs:
-                # Initialize with the current sequence length (full KV cache length)
-                r.extra_kwargs["attention_mask_seq_len"] = seq_length
-            else:
-                # Use the previously tracked length, but ensure it doesn't exceed current seq_length
-                tracked_len = r.extra_kwargs["attention_mask_seq_len"]
-                r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
-
-        # For decode phase after KV cache merge, all requests should have attention mask
-        # that matches the merged sequence length
+        max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
         for r in reqs:
+            r.extra_kwargs["attention_mask_seq_len"] += 1
             real_len = r.extra_kwargs["attention_mask_seq_len"]
+            pad_len = max_len - real_len
 
-            # The attention mask should cover the full sequence length
-            if real_len < seq_length:
-                # Pad with zeros on the left to reach full sequence length
-                pad_len = seq_length - real_len
-
-                if self._tokenizer.padding_side == "left":
-                    x = torch.cat(
-                        [
-                            torch.full((pad_len,), 0, dtype=torch.long),
-                            torch.ones((real_len,), dtype=torch.long),
-                        ]
-                    )
-                else:
-                    x = torch.cat(
-                        [
-                            torch.ones((real_len,), dtype=torch.long),
-                            torch.full((pad_len,), 0, dtype=torch.long),
-                        ]
-                    )
+            if self._tokenizer.padding_side == "left":
+                x = torch.cat(
+                    [
+                        (
+                            torch.full((pad_len,), 0, dtype=torch.long)
+                            if pad_len > 0
+                            else torch.tensor([], dtype=torch.long)
+                        ),
+                        torch.ones((real_len,), dtype=torch.long),
+                    ]
+                )
             else:
-                # Already at correct length
-                x = torch.ones((real_len,), dtype=torch.long)
-
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                    ]
+                )
             data.append(x)
 
         return torch.stack(data).to(self._device)
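The rewrite above replaces per-request clamping against a global seq_length with simpler bookkeeping: each decode step increments every request's tracked mask length by one, then pads each mask up to the batch maximum on the tokenizer's padding side. A self-contained sketch of just that padding arithmetic (request objects are stubbed with dicts and padding_side stands in for self._tokenizer.padding_side; only the logic mirrors the diff):

import torch

# Stubbed request objects; only `extra_kwargs` matters here.
reqs = [
    {"extra_kwargs": {"attention_mask_seq_len": 7}},  # longer request
    {"extra_kwargs": {"attention_mask_seq_len": 5}},  # shorter request
]
padding_side = "left"  # stands in for self._tokenizer.padding_side

# Each decode step emits one token, so every tracked length grows by 1 and
# the batch pads up to the longest request's new length.
max_len = max(r["extra_kwargs"]["attention_mask_seq_len"] for r in reqs) + 1

data = []
for r in reqs:
    r["extra_kwargs"]["attention_mask_seq_len"] += 1
    real_len = r["extra_kwargs"]["attention_mask_seq_len"]
    pad_len = max_len - real_len
    ones = torch.ones((real_len,), dtype=torch.long)
    zeros = torch.zeros((pad_len,), dtype=torch.long)  # empty when pad_len == 0
    x = torch.cat([zeros, ones]) if padding_side == "left" else torch.cat([ones, zeros])
    data.append(x)

print(torch.stack(data))
# tensor([[1, 1, 1, 1, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1, 1, 1, 1]])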
@@ -285,30 +285,10 @@ def _batch_inference_one_step_internal(
         # This prevents batch size mismatches during merging
         decode_kv = decode_reqs[0].kv_cache
 
-        # Verify that all decode requests share the same kv_cache
-        for req in decode_reqs[1:]:
-            if req.kv_cache is not decode_kv:
-                logger.warning(
-                    "Inconsistent kv_cache references detected in decode requests. "
-                    "This may indicate a batching synchronization issue."
-                )
-                # Use the first decode_kv as the reference to maintain consistency
-                req.kv_cache = decode_kv
-
         # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
         merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
-        # Update sequence length information after KV cache merge
-        _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
-            merged_kv_cache, xinf_model_obj
-        )
         for r in valid_req_list:
             r.kv_cache = merged_kv_cache
-            # Update attention mask sequence length to match merged KV cache
-            if "attention_mask_seq_len" in r.extra_kwargs:
-                # Ensure the attention mask length doesn't exceed the merged sequence length
-                r.extra_kwargs["attention_mask_seq_len"] = min(
-                    r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
-                )
         empty_cache()
     else:
         for r in valid_req_list:
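This hunk drops the defensive kv_cache consistency check and the post-merge clamping of attention_mask_seq_len, leaving merge_kv_cache as the single point where prefill and decode caches are reconciled. merge_kv_cache itself is not shown in this diff; the sketch below only illustrates the merge-at-batch_size-and-seq_len idea from the comment, using one layer's key tensor with assumed (batch, heads, seq, head_dim) shapes, not xinference's actual implementation:

import torch
import torch.nn.functional as F

# Illustrative shapes only.
decode_keys = torch.randn(2, 4, 10, 64)   # running decode batch, seq len 10
prefill_keys = torch.randn(1, 4, 6, 64)   # newly prefilled request, seq len 6

# Left-pad the shorter cache along the seq dimension so lengths match; the
# zero-filled positions line up with the `0` entries the attention-mask code
# above pads on the left.
pad = decode_keys.shape[2] - prefill_keys.shape[2]
prefill_keys = F.pad(prefill_keys, (0, 0, pad, 0))

# Then concatenate along the batch dimension.
merged_keys = torch.cat([decode_keys, prefill_keys], dim=0)
print(merged_keys.shape)  # torch.Size([3, 4, 10, 64])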
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5ea97072.css",
-    "main.js": "./static/js/main.45e78536.js",
+    "main.js": "./static/js/main.e4d9a9e1.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5ea97072.css.map": "./static/css/main.5ea97072.css.map",
-    "main.45e78536.js.map": "./static/js/main.45e78536.js.map"
+    "main.e4d9a9e1.js.map": "./static/js/main.e4d9a9e1.js.map"
   },
   "entrypoints": [
     "static/css/main.5ea97072.css",
-    "static/js/main.45e78536.js"
+    "static/js/main.e4d9a9e1.js"
   ]
 }
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.45e78536.js"></script><link href="./static/css/main.5ea97072.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e4d9a9e1.js"></script><link href="./static/css/main.5ea97072.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>