xinference 1.11.0__py3-none-any.whl → 1.11.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/model/llm/transformers/core.py +20 -36
- xinference/model/llm/transformers/utils.py +0 -20
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/{main.45e78536.js → main.e4d9a9e1.js} +3 -3
- xinference/ui/web/ui/build/static/js/main.e4d9a9e1.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e6770a05771952175c9fbf48fce283c9bb1bc8b5763e39edc36d099d1fe16b4a.json +1 -0
- {xinference-1.11.0.dist-info → xinference-1.11.0.post1.dist-info}/METADATA +2 -1
- {xinference-1.11.0.dist-info → xinference-1.11.0.post1.dist-info}/RECORD +15 -15
- xinference/ui/web/ui/build/static/js/main.45e78536.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.45e78536.js.LICENSE.txt → main.e4d9a9e1.js.LICENSE.txt} +0 -0
- {xinference-1.11.0.dist-info → xinference-1.11.0.post1.dist-info}/WHEEL +0 -0
- {xinference-1.11.0.dist-info → xinference-1.11.0.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.11.0.dist-info → xinference-1.11.0.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.11.0.dist-info → xinference-1.11.0.post1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-20T18:17:30+0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "1.11.0"
|
|
14
|
+
"full-revisionid": "378b99185de5a7623f75798df7e4391f4ff39e35",
|
|
15
|
+
"version": "1.11.0.post1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -549,46 +549,30 @@ class PytorchModel(LLM):
|
|
|
549
549
|
So we need pad `0` on the left again.
|
|
550
550
|
"""
|
|
551
551
|
data = []
|
|
552
|
-
|
|
553
|
-
# All requests in batch should have attention mask of length `seq_length`
|
|
554
|
-
for r in reqs:
|
|
555
|
-
# Get the actual sequence length for this request from its tracking
|
|
556
|
-
if "attention_mask_seq_len" not in r.extra_kwargs:
|
|
557
|
-
# Initialize with the current sequence length (full KV cache length)
|
|
558
|
-
r.extra_kwargs["attention_mask_seq_len"] = seq_length
|
|
559
|
-
else:
|
|
560
|
-
# Use the previously tracked length, but ensure it doesn't exceed current seq_length
|
|
561
|
-
tracked_len = r.extra_kwargs["attention_mask_seq_len"]
|
|
562
|
-
r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
|
|
563
|
-
|
|
564
|
-
# For decode phase after KV cache merge, all requests should have attention mask
|
|
565
|
-
# that matches the merged sequence length
|
|
552
|
+
max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
|
|
566
553
|
for r in reqs:
|
|
554
|
+
r.extra_kwargs["attention_mask_seq_len"] += 1
|
|
567
555
|
real_len = r.extra_kwargs["attention_mask_seq_len"]
|
|
556
|
+
pad_len = max_len - real_len
|
|
568
557
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
)
|
|
581
|
-
else:
|
|
582
|
-
x = torch.cat(
|
|
583
|
-
[
|
|
584
|
-
torch.ones((real_len,), dtype=torch.long),
|
|
585
|
-
torch.full((pad_len,), 0, dtype=torch.long),
|
|
586
|
-
]
|
|
587
|
-
)
|
|
558
|
+
if self._tokenizer.padding_side == "left":
|
|
559
|
+
x = torch.cat(
|
|
560
|
+
[
|
|
561
|
+
(
|
|
562
|
+
torch.full((pad_len,), 0, dtype=torch.long)
|
|
563
|
+
if pad_len > 0
|
|
564
|
+
else torch.tensor([], dtype=torch.long)
|
|
565
|
+
),
|
|
566
|
+
torch.ones((real_len,), dtype=torch.long),
|
|
567
|
+
]
|
|
568
|
+
)
|
|
588
569
|
else:
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
570
|
+
x = torch.cat(
|
|
571
|
+
[
|
|
572
|
+
torch.ones((real_len,), dtype=torch.long),
|
|
573
|
+
torch.full((pad_len,), 0, dtype=torch.long),
|
|
574
|
+
]
|
|
575
|
+
)
|
|
592
576
|
data.append(x)
|
|
593
577
|
|
|
594
578
|
return torch.stack(data).to(self._device)
|
|
@@ -285,30 +285,10 @@ def _batch_inference_one_step_internal(
|
|
|
285
285
|
# This prevents batch size mismatches during merging
|
|
286
286
|
decode_kv = decode_reqs[0].kv_cache
|
|
287
287
|
|
|
288
|
-
# Verify that all decode requests share the same kv_cache
|
|
289
|
-
for req in decode_reqs[1:]:
|
|
290
|
-
if req.kv_cache is not decode_kv:
|
|
291
|
-
logger.warning(
|
|
292
|
-
"Inconsistent kv_cache references detected in decode requests. "
|
|
293
|
-
"This may indicate a batching synchronization issue."
|
|
294
|
-
)
|
|
295
|
-
# Use the first decode_kv as the reference to maintain consistency
|
|
296
|
-
req.kv_cache = decode_kv
|
|
297
|
-
|
|
298
288
|
# prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
|
|
299
289
|
merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
|
|
300
|
-
# Update sequence length information after KV cache merge
|
|
301
|
-
_, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
|
|
302
|
-
merged_kv_cache, xinf_model_obj
|
|
303
|
-
)
|
|
304
290
|
for r in valid_req_list:
|
|
305
291
|
r.kv_cache = merged_kv_cache
|
|
306
|
-
# Update attention mask sequence length to match merged KV cache
|
|
307
|
-
if "attention_mask_seq_len" in r.extra_kwargs:
|
|
308
|
-
# Ensure the attention mask length doesn't exceed the merged sequence length
|
|
309
|
-
r.extra_kwargs["attention_mask_seq_len"] = min(
|
|
310
|
-
r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
|
|
311
|
-
)
|
|
312
292
|
empty_cache()
|
|
313
293
|
else:
|
|
314
294
|
for r in valid_req_list:
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"files": {
|
|
3
3
|
"main.css": "./static/css/main.5ea97072.css",
|
|
4
|
-
"main.js": "./static/js/main.45e78536.js",
|
|
4
|
+
"main.js": "./static/js/main.e4d9a9e1.js",
|
|
5
5
|
"static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
|
|
6
6
|
"index.html": "./index.html",
|
|
7
7
|
"main.5ea97072.css.map": "./static/css/main.5ea97072.css.map",
|
|
8
|
-
"main.45e78536.js.map": "./static/js/main.45e78536.js.map"
|
|
8
|
+
"main.e4d9a9e1.js.map": "./static/js/main.e4d9a9e1.js.map"
|
|
9
9
|
},
|
|
10
10
|
"entrypoints": [
|
|
11
11
|
"static/css/main.5ea97072.css",
|
|
12
|
-
"static/js/main.45e78536.js"
|
|
12
|
+
"static/js/main.e4d9a9e1.js"
|
|
13
13
|
]
|
|
14
14
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.45e78536.js"></script><link href="./static/css/main.5ea97072.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|
|
1
|
+
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e4d9a9e1.js"></script><link href="./static/css/main.5ea97072.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|