sglang 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +31 -13
- sglang/bench_server_latency.py +21 -10
- sglang/bench_serving.py +101 -7
- sglang/global_config.py +0 -1
- sglang/srt/conversation.py +11 -2
- sglang/srt/layers/attention/__init__.py +27 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
- sglang/srt/layers/attention/flashinfer_backend.py +352 -83
- sglang/srt/layers/attention/triton_backend.py +6 -4
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
- sglang/srt/layers/sampler.py +6 -2
- sglang/srt/managers/data_parallel_controller.py +177 -0
- sglang/srt/managers/detokenizer_manager.py +31 -10
- sglang/srt/managers/io_struct.py +11 -2
- sglang/srt/managers/schedule_batch.py +126 -43
- sglang/srt/managers/schedule_policy.py +2 -1
- sglang/srt/managers/scheduler.py +245 -142
- sglang/srt/managers/tokenizer_manager.py +14 -1
- sglang/srt/managers/tp_worker.py +111 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -4
- sglang/srt/mem_cache/memory_pool.py +77 -4
- sglang/srt/mem_cache/radix_cache.py +15 -7
- sglang/srt/model_executor/cuda_graph_runner.py +4 -4
- sglang/srt/model_executor/forward_batch_info.py +16 -21
- sglang/srt/model_executor/model_runner.py +100 -36
- sglang/srt/models/baichuan.py +2 -3
- sglang/srt/models/chatglm.py +5 -6
- sglang/srt/models/commandr.py +1 -2
- sglang/srt/models/dbrx.py +1 -2
- sglang/srt/models/deepseek.py +4 -5
- sglang/srt/models/deepseek_v2.py +5 -6
- sglang/srt/models/exaone.py +1 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +5 -5
- sglang/srt/models/gpt_bigcode.py +5 -5
- sglang/srt/models/grok.py +1 -2
- sglang/srt/models/internlm2.py +1 -2
- sglang/srt/models/llama.py +1 -2
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +4 -8
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +1 -2
- sglang/srt/models/minicpm3.py +5 -6
- sglang/srt/models/mixtral.py +1 -2
- sglang/srt/models/mixtral_quant.py +1 -2
- sglang/srt/models/olmo.py +352 -0
- sglang/srt/models/olmoe.py +1 -2
- sglang/srt/models/qwen.py +1 -2
- sglang/srt/models/qwen2.py +1 -2
- sglang/srt/models/qwen2_moe.py +4 -5
- sglang/srt/models/stablelm.py +1 -2
- sglang/srt/models/torch_native_llama.py +1 -2
- sglang/srt/models/xverse.py +1 -2
- sglang/srt/models/xverse_moe.py +4 -5
- sglang/srt/models/yivl.py +1 -2
- sglang/srt/openai_api/adapter.py +97 -52
- sglang/srt/openai_api/protocol.py +10 -2
- sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
- sglang/srt/sampling/sampling_batch_info.py +105 -59
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server.py +171 -37
- sglang/srt/server_args.py +127 -48
- sglang/srt/utils.py +37 -14
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/few_shot_gsm8k_engine.py +144 -0
- sglang/test/srt/sampling/penaltylib/utils.py +16 -12
- sglang/version.py +1 -1
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/METADATA +82 -32
- sglang-0.3.4.dist-info/RECORD +143 -0
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/WHEEL +1 -1
- sglang/srt/layers/attention/flashinfer_utils.py +0 -237
- sglang-0.3.3.dist-info/RECORD +0 -139
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/LICENSE +0 -0
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.3
|
3
|
+
Version: 0.3.4
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -219,36 +219,49 @@ Requires-Dist: sglang[srt]; extra == "all"
|
|
219
219
|
Requires-Dist: sglang[openai]; extra == "all"
|
220
220
|
Requires-Dist: sglang[anthropic]; extra == "all"
|
221
221
|
Requires-Dist: sglang[litellm]; extra == "all"
|
222
|
+
Provides-Extra: all_xpu
|
223
|
+
Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
|
224
|
+
Requires-Dist: sglang[openai]; extra == "all-xpu"
|
225
|
+
Requires-Dist: sglang[anthropic]; extra == "all-xpu"
|
226
|
+
Requires-Dist: sglang[litellm]; extra == "all-xpu"
|
222
227
|
Provides-Extra: anthropic
|
223
228
|
Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
|
224
229
|
Provides-Extra: dev
|
225
230
|
Requires-Dist: sglang[all]; extra == "dev"
|
226
231
|
Requires-Dist: sglang[test]; extra == "dev"
|
232
|
+
Provides-Extra: dev_xpu
|
233
|
+
Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
|
234
|
+
Requires-Dist: sglang[test]; extra == "dev-xpu"
|
227
235
|
Provides-Extra: litellm
|
228
236
|
Requires-Dist: litellm>=1.0.0; extra == "litellm"
|
229
237
|
Provides-Extra: openai
|
230
238
|
Requires-Dist: openai>=1.0; extra == "openai"
|
231
239
|
Requires-Dist: tiktoken; extra == "openai"
|
240
|
+
Provides-Extra: runtime_common
|
241
|
+
Requires-Dist: aiohttp; extra == "runtime-common"
|
242
|
+
Requires-Dist: decord; extra == "runtime-common"
|
243
|
+
Requires-Dist: fastapi; extra == "runtime-common"
|
244
|
+
Requires-Dist: hf-transfer; extra == "runtime-common"
|
245
|
+
Requires-Dist: huggingface-hub; extra == "runtime-common"
|
246
|
+
Requires-Dist: interegular; extra == "runtime-common"
|
247
|
+
Requires-Dist: orjson; extra == "runtime-common"
|
248
|
+
Requires-Dist: packaging; extra == "runtime-common"
|
249
|
+
Requires-Dist: pillow; extra == "runtime-common"
|
250
|
+
Requires-Dist: psutil; extra == "runtime-common"
|
251
|
+
Requires-Dist: pydantic; extra == "runtime-common"
|
252
|
+
Requires-Dist: python-multipart; extra == "runtime-common"
|
253
|
+
Requires-Dist: torchao; extra == "runtime-common"
|
254
|
+
Requires-Dist: uvicorn; extra == "runtime-common"
|
255
|
+
Requires-Dist: uvloop; extra == "runtime-common"
|
256
|
+
Requires-Dist: zmq; extra == "runtime-common"
|
257
|
+
Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
|
258
|
+
Requires-Dist: modelscope; extra == "runtime-common"
|
232
259
|
Provides-Extra: srt
|
233
|
-
Requires-Dist: aiohttp; extra == "srt"
|
234
|
-
Requires-Dist: decord; extra == "srt"
|
235
|
-
Requires-Dist: fastapi; extra == "srt"
|
236
|
-
Requires-Dist: hf-transfer; extra == "srt"
|
237
|
-
Requires-Dist: huggingface-hub; extra == "srt"
|
238
|
-
Requires-Dist: interegular; extra == "srt"
|
239
|
-
Requires-Dist: packaging; extra == "srt"
|
240
|
-
Requires-Dist: pillow; extra == "srt"
|
241
|
-
Requires-Dist: psutil; extra == "srt"
|
242
|
-
Requires-Dist: pydantic; extra == "srt"
|
243
|
-
Requires-Dist: python-multipart; extra == "srt"
|
260
|
+
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
244
261
|
Requires-Dist: torch; extra == "srt"
|
245
|
-
Requires-Dist: torchao; extra == "srt"
|
246
|
-
Requires-Dist: uvicorn; extra == "srt"
|
247
|
-
Requires-Dist: uvloop; extra == "srt"
|
248
|
-
Requires-Dist: zmq; extra == "srt"
|
249
262
|
Requires-Dist: vllm==0.5.5; extra == "srt"
|
250
|
-
|
251
|
-
Requires-Dist:
|
263
|
+
Provides-Extra: srt_xpu
|
264
|
+
Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
|
252
265
|
Provides-Extra: test
|
253
266
|
Requires-Dist: jsonlines; extra == "test"
|
254
267
|
Requires-Dist: matplotlib; extra == "test"
|
@@ -257,8 +270,8 @@ Requires-Dist: sentence-transformers; extra == "test"
|
|
257
270
|
Requires-Dist: accelerate; extra == "test"
|
258
271
|
Requires-Dist: peft; extra == "test"
|
259
272
|
|
260
|
-
<div align="center">
|
261
|
-
<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
|
273
|
+
<div align="center" id="sglangtop">
|
274
|
+
<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
|
262
275
|
|
263
276
|
[](https://pypi.org/project/sglang)
|
264
277
|

|
@@ -270,15 +283,13 @@ Requires-Dist: peft; extra == "test"
|
|
270
283
|
|
271
284
|
--------------------------------------------------------------------------------
|
272
285
|
|
273
|
-
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**
|
274
|
-
|
275
|
-
## Upcoming Events
|
276
|
-
- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
|
277
|
-
- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
|
286
|
+
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
|
287
|
+
[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
|
278
288
|
|
279
289
|
## News
|
280
|
-
- [2024/
|
281
|
-
- [2024/
|
290
|
+
- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
|
291
|
+
- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
292
|
+
- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
|
282
293
|
- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
283
294
|
|
284
295
|
<details>
|
@@ -324,7 +335,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
324
335
|
### Method 2: From source
|
325
336
|
```
|
326
337
|
# Use the last release branch
|
327
|
-
git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
|
338
|
+
git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
|
328
339
|
cd sglang
|
329
340
|
|
330
341
|
pip install --upgrade pip
|
@@ -501,6 +512,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
501
512
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
|
502
513
|
```
|
503
514
|
|
515
|
+
### Engine Without HTTP Server
|
516
|
+
|
517
|
+
We also provide an inference engine **without a HTTP server**. For example,
|
518
|
+
|
519
|
+
```python
|
520
|
+
import sglang as sgl
|
521
|
+
|
522
|
+
|
523
|
+
def main():
|
524
|
+
prompts = [
|
525
|
+
"Hello, my name is",
|
526
|
+
"The president of the United States is",
|
527
|
+
"The capital of France is",
|
528
|
+
"The future of AI is",
|
529
|
+
]
|
530
|
+
sampling_params = {"temperature": 0.8, "top_p": 0.95}
|
531
|
+
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
|
532
|
+
|
533
|
+
outputs = llm.generate(prompts, sampling_params)
|
534
|
+
for prompt, output in zip(prompts, outputs):
|
535
|
+
print("===============================")
|
536
|
+
print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
537
|
+
|
538
|
+
if __name__ == "__main__":
|
539
|
+
main()
|
540
|
+
```
|
541
|
+
|
542
|
+
This can be used for:
|
543
|
+
|
544
|
+
1. **Offline Batch Inference**
|
545
|
+
2. **Building Custom Servers**
|
546
|
+
|
547
|
+
You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
|
548
|
+
|
504
549
|
### Supported Models
|
505
550
|
|
506
551
|
**Generative Models**
|
@@ -837,10 +882,7 @@ def chat_example(s):
|
|
837
882
|
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
|
838
883
|
|
839
884
|
## Benchmark And Performance
|
840
|
-
|
841
|
-

|
842
|
-
|
843
|
-
Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
|
885
|
+
Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
|
844
886
|
|
845
887
|
## Roadmap
|
846
888
|
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
|
@@ -848,3 +890,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
|
|
848
890
|
## Citation And Acknowledgment
|
849
891
|
Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
|
850
892
|
We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
893
|
+
|
894
|
+
|
895
|
+
|
896
|
+
<p align="center">
|
897
|
+
<a href="#sglangtop" target="_blank">
|
898
|
+
<bold>Back To Top </bold>
|
899
|
+
</a>
|
900
|
+
</p>
|
@@ -0,0 +1,143 @@
|
|
1
|
+
sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
|
2
|
+
sglang/api.py,sha256=5x591S4rLbmNPs75qPwGKVu1sonVGDyjPAJlHTyWw50,6956
|
3
|
+
sglang/bench_latency.py,sha256=RWSyZ-UhLV6dyPMMtK3nSOoNsjCY5xMpYKeUKRNtdcA,18276
|
4
|
+
sglang/bench_server_latency.py,sha256=2AMPwU2_85q-Btz9UdZC-TnZJPgXcNkydvFYWn2CJlU,5892
|
5
|
+
sglang/bench_serving.py,sha256=jcxNP7reIJPh3x1hG5TCM6wMlDXshjyMJUUjL2O7kzs,40060
|
6
|
+
sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
|
7
|
+
sglang/global_config.py,sha256=1r_W9rrBxGCCc2eqESRduOMMNq46e54xLgFLifHuQm0,1014
|
8
|
+
sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
|
9
|
+
sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
|
10
|
+
sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
|
11
|
+
sglang/version.py,sha256=oYLGMpySamd16KLiaBTfRyrAS7_oyp-TOEHmzmeumwg,22
|
12
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
+
sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
|
14
|
+
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
15
|
+
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
16
|
+
sglang/lang/interpreter.py,sha256=zakc6IkzATaMqVDWKWvqDRrqnRykxFawajA7aUHUDbI,30640
|
17
|
+
sglang/lang/ir.py,sha256=F_9ac10OjktxR7KhOV07wiJXV20s79cRfh9d4koExJc,18262
|
18
|
+
sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
|
19
|
+
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
21
|
+
sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
|
22
|
+
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
23
|
+
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
24
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=iVb7SlrpJ1ic92QG5kQUphZUb2EaVWY43dkmAO5pju4,10514
|
25
|
+
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
26
|
+
sglang/srt/conversation.py,sha256=B4QPGOUVdoXRJwWWxSm5pfifGpuBs07fDTxJ1BHUXLw,20003
|
27
|
+
sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
|
28
|
+
sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
|
29
|
+
sglang/srt/server.py,sha256=65b39k4FN_TzL8qAimS1mRx8xdO8jmKCdUftOISUv7M,26809
|
30
|
+
sglang/srt/server_args.py,sha256=IDuX8ZCJd_6t2xHf7wNGskVgvpAQtUcFSGBnKFnLf3U,27290
|
31
|
+
sglang/srt/utils.py,sha256=0zalNeGrtrIyfmD7DHLRqocCY1_yNbPCD5hmionHpL0,23071
|
32
|
+
sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
|
33
|
+
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
34
|
+
sglang/srt/configs/model_config.py,sha256=36My-o44trhWY3KYDeSFMGvv9XuUtIVI5e7F8VlOTWo,6723
|
35
|
+
sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
|
36
|
+
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
37
|
+
sglang/srt/constrained/fsm_cache.py,sha256=9GtliIN55Ov8Q9MSFfQC5rKrz3qTsB7Cm5OkhivKngY,3271
|
38
|
+
sglang/srt/constrained/jump_forward.py,sha256=o-CzJu3DEs0eFKlLzsQVYMSo4vBKpffs25sXLOJd6jc,6997
|
39
|
+
sglang/srt/layers/activation.py,sha256=7VEkCrx2dvl629Lz0fkJcJfVoZA-ykEdkpTzKEc_drQ,5225
|
40
|
+
sglang/srt/layers/layernorm.py,sha256=HCj8Y_X6MNNdtQU2sWKgyjIqVERxl9dqrmjbBbyJjpE,3796
|
41
|
+
sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
|
42
|
+
sglang/srt/layers/logits_processor.py,sha256=Fq7VHwjP4iSzl_OBLo8qw_HVbIDbYB-0MGmfiD3Jk_E,12521
|
43
|
+
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
44
|
+
sglang/srt/layers/radix_attention.py,sha256=i07VRXPDHj-zJ1TSrXEqCxumQwYSHwAvc8DoIg-Irtg,1964
|
45
|
+
sglang/srt/layers/sampler.py,sha256=23wRDw2Fs3wZfPBh6gFBz2vqwxnoDd9LAHWq7YdQWlc,4166
|
46
|
+
sglang/srt/layers/torchao_utils.py,sha256=1nzZkSzbF4qCAMeBKAeeDpMl_mK8imiY2RL3xFEgvAw,3340
|
47
|
+
sglang/srt/layers/attention/__init__.py,sha256=hyrPpnuiTs5VIZNyoIjZLRsHUX20gX2dvY9kkqyXIII,2158
|
48
|
+
sglang/srt/layers/attention/double_sparsity_backend.py,sha256=owzPwLWcShZ0ezkVjBr0vV73vtQIUh8z-rcQtXLG1fk,10374
|
49
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=y9saTqOhb_tJoRtjq9sishlQFGYlFkQD7QcV9x_ureo,19554
|
50
|
+
sglang/srt/layers/attention/triton_backend.py,sha256=B6YuIZWh4Zn85Q57i0z3eZ08RCrS0rfyPJzkRr_zOIQ,6150
|
51
|
+
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
|
52
|
+
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
|
53
|
+
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=nEG7iBh1pAy3WaqPdLZwCJwDgyk5HLQ181kBS2nxbwg,11179
|
54
|
+
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=bNHHZeEowwI1wwOWj2T6bjBTBtVZUbcL-0cgfZwpHek,5471
|
55
|
+
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
56
|
+
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
57
|
+
sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
|
58
|
+
sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
|
59
|
+
sglang/srt/layers/quantization/__init__.py,sha256=QilMNqgu3eOFUkEjXLSDa1NvoNdi_CAvC8a1hprOgN8,2979
|
60
|
+
sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
|
61
|
+
sglang/srt/lora/lora.py,sha256=a5j_Yy0s95msVPFgOuH5PCe7sMu0AyZFQ5wL0H-YIg8,14913
|
62
|
+
sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
|
63
|
+
sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
|
64
|
+
sglang/srt/managers/data_parallel_controller.py,sha256=GJGfX1-5DoQFZ-EMh_p02nvrOtrOc0UebnULWHhFrss,5765
|
65
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=G2-Y-nDbq7LF8ZFWcXXcMkbCwzyBEh1g4UrciDlkNYY,7985
|
66
|
+
sglang/srt/managers/image_processor.py,sha256=9Y9RqyLdbt4uOK7pnJCJIhY77791klskSrEg8U6pyS4,6910
|
67
|
+
sglang/srt/managers/io_struct.py,sha256=QJ1Eu7XNsRH35ec5nUOUS2XSjiFWyjYHKsnMqviC_Mk,12298
|
68
|
+
sglang/srt/managers/schedule_batch.py,sha256=6LUXbAd6PvHIDk6iwpsufosahIT2z32JZjsNx5xeKcg,33930
|
69
|
+
sglang/srt/managers/schedule_policy.py,sha256=unDmK7Y_Ti0Eiizh3_iEFMsC1KDqGMTqU8MlQgg-6qo,11951
|
70
|
+
sglang/srt/managers/scheduler.py,sha256=Y7R-VkLt8Az2jZGrGRuhG1g4UPO5y-7b9BaOknuC2aI,43019
|
71
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=SprHC0Bs8kvtnYboDPrH587uO_sdKHyp7tVBCdbEB9c,25066
|
72
|
+
sglang/srt/managers/tp_worker.py,sha256=nwkIXiasGA4w97pnMG32U1DN1RlLFkvETvl9q7SjGeY,8887
|
73
|
+
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
74
|
+
sglang/srt/mem_cache/chunk_cache.py,sha256=VcCpyrf5FOQ5xoKeOouCI5ZQLkZo_pgY1SPbDDkagGg,2492
|
75
|
+
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
76
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=ihVZXlJ_Fvs1L2c2SZQaijUYSn9X6eyiFiG2NNRQS_M,9297
|
77
|
+
sglang/srt/mem_cache/radix_cache.py,sha256=cS6G5uOW_0QICH30PXxatetka4wnELfhP4czHn8RDJE,10414
|
78
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=KgSBvoF2IyCGDFNXQyN7sV3E_S2NndeMQyaQZB97Pak,10499
|
79
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=rSazAtkWKyc2g2QILT2-AsUdaBt51hQBU1qcS2iw_Nw,5690
|
80
|
+
sglang/srt/model_executor/model_runner.py,sha256=hn-VeLABZL4y_GcJFl2y_C7x6ZS_Xw0tDyKzOxYMtVQ,25983
|
81
|
+
sglang/srt/models/baichuan.py,sha256=uV20fr7SqlATxoziXRtJyXFnZZoWTUy3qvQNFaEvw8M,15014
|
82
|
+
sglang/srt/models/chatglm.py,sha256=uep4Wy_2jwn_x6Cvagt5rs3JRY_AlcM-VXvRTCFC5Tc,13172
|
83
|
+
sglang/srt/models/commandr.py,sha256=WIMwjV3C0pRbVs4Xv9tqnHGreRvWC7zsML2hNuXw4A0,14060
|
84
|
+
sglang/srt/models/dbrx.py,sha256=_DshXyXr_xVB7wtE28PFcb6KDIkA6gygkXYKqivSCFc,14554
|
85
|
+
sglang/srt/models/deepseek.py,sha256=W342tVpEpkc_fvO_DTP4fX3EGF-DIFC4QOySdUGzl9w,15837
|
86
|
+
sglang/srt/models/deepseek_v2.py,sha256=5P5678aaMT4iO4jS0dZWUiRG4o9EE98xVgs6Zjy-Mr0,28229
|
87
|
+
sglang/srt/models/exaone.py,sha256=bIXdAXoWlCdfDdX2q47Br3QOa3jEYiiP2Hdd1T4crnM,12993
|
88
|
+
sglang/srt/models/gemma.py,sha256=4MVHwc5Jc4CSg3HIdNJEBYk8mhspjuwvc_6Oi8Cd-g8,12202
|
89
|
+
sglang/srt/models/gemma2.py,sha256=3VL223T_3syBG3fUInbtFaXvIs7dYjtsfX3OfDQc7m4,14777
|
90
|
+
sglang/srt/models/gpt_bigcode.py,sha256=q9N13Js2v0VheudWssRoSjnptS6TSf7DOmC8zLRGxeo,10049
|
91
|
+
sglang/srt/models/grok.py,sha256=vc7-E_hemNKaNORxg4rmaQcVYlpoavyaAZUG9B2dgbY,14835
|
92
|
+
sglang/srt/models/internlm2.py,sha256=-liQB13sgR3GnXJacBSMuEbLa2N4tICx0LsNgu_nNvU,12108
|
93
|
+
sglang/srt/models/llama.py,sha256=a43Y5mvMDmFcRcPL78vsAElaOvTqPajLPB2_BDwJ7pM,15767
|
94
|
+
sglang/srt/models/llama_classification.py,sha256=WcHYFez7qloTCpXLy1A6-dBGHWp22ebv6yG68jFVBjc,3318
|
95
|
+
sglang/srt/models/llama_embedding.py,sha256=4j3WNLB-x7XQnJvohdRs7VSSEabbhiE2BRHmnG5IZRU,3453
|
96
|
+
sglang/srt/models/llama_reward.py,sha256=ag3eVdP38iURj81fTCa-sC2jV_eCkTIjXUQf1I96fCI,5297
|
97
|
+
sglang/srt/models/llava.py,sha256=ny3sK2sgYwrEhawSAc1tZeltcgukphSTdxsqyq-Epkc,24857
|
98
|
+
sglang/srt/models/llavavid.py,sha256=ztS5He-NF4fmfujdoMnKljOG1fNfPvp-6bduT7B6EMU,12137
|
99
|
+
sglang/srt/models/minicpm.py,sha256=LpUdxKA27z79DSYAPPlfCgI4GEnWCYznhgSQl-QCsTY,13731
|
100
|
+
sglang/srt/models/minicpm3.py,sha256=-fLZ-RRbR2jLGSsatBWV-qsSNIZCPbS_jasmrOlUdK8,25023
|
101
|
+
sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
|
102
|
+
sglang/srt/models/mixtral.py,sha256=UUqzpOBXsObirmpJz4xstlG82uu4JfXsh-gWQmiKbW0,13803
|
103
|
+
sglang/srt/models/mixtral_quant.py,sha256=HPipVG_Gc5Ki0YXg49Rwn2_uvtCCI1IxlA7mVRVFivw,13978
|
104
|
+
sglang/srt/models/olmo.py,sha256=lD4VewXK0rVqhttGkOOzaxoqRQgVfV90s8ElStPBBdE,11896
|
105
|
+
sglang/srt/models/olmoe.py,sha256=3qHnY1DWBhyx9FWGJGb3a8kewcmEdYZOkYZ1JBx1LWs,15251
|
106
|
+
sglang/srt/models/qwen.py,sha256=mjGqo3NkTYfJ2qqztFw8mjKggPT2moW15nQgrq3GxWk,9860
|
107
|
+
sglang/srt/models/qwen2.py,sha256=I2ZzH9pVTZdjP1fHlq1qdG4JiWHt1CC6t1EK2gN5Ppc,12337
|
108
|
+
sglang/srt/models/qwen2_moe.py,sha256=BaNq8xgZKqjr_fcEBtH4yjBSc3-p4VztPiknVwllcQk,16984
|
109
|
+
sglang/srt/models/stablelm.py,sha256=0NWUVsYGhbc_X2eT9x38MaaUhZGmFtMgw_2PBv25Yxw,11265
|
110
|
+
sglang/srt/models/torch_native_llama.py,sha256=dtasdhwfRPE1eOcAIFUBsHrDnkjegXvo8WhGlqvXGKk,19154
|
111
|
+
sglang/srt/models/xverse.py,sha256=v4OaFdss9oD5YNzXsnjoXE9ffCkXL9U5o0OWLm1vHQQ,13573
|
112
|
+
sglang/srt/models/xverse_moe.py,sha256=A8EB82NpozoBplp7Qd8B_kY_3cL-UMydAxYIrhACVPE,15682
|
113
|
+
sglang/srt/models/yivl.py,sha256=xcWqkuZ29FmBBJY6aKetwItWIPl-kfXK-QmgdLONles,4765
|
114
|
+
sglang/srt/openai_api/adapter.py,sha256=WkYCKVaYTkFdLrySBhlkDyHJVaaHMF7KrhNnmw3L3us,53534
|
115
|
+
sglang/srt/openai_api/protocol.py,sha256=EZ6G209rBEDP7cepO2kAYqE8wMe1ksYdN7to1iT97Lw,10248
|
116
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=EAdep3I5qmbDDQJ0Ktrq0ySXJ6DCrTAjniEwFu4ZRqE,7679
|
117
|
+
sglang/srt/sampling/sampling_params.py,sha256=ZPHCQq7Bi4P_sxUzdKgYVXZpB_tC-kA7rlLwiW9Ct9A,5781
|
118
|
+
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
119
|
+
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=kizcPnxtRawmDt6utRuhbk4yfNs5H5mx1DAlDVEZRv8,11328
|
120
|
+
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
|
121
|
+
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
|
122
|
+
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
123
|
+
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
124
|
+
sglang/test/few_shot_gsm8k.py,sha256=ll-gNbcv829IwSPXAZt4JIEIu8IR3APCLcX3BHOFVp8,3968
|
125
|
+
sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
|
126
|
+
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
127
|
+
sglang/test/runners.py,sha256=VCmtH08FsAq_JTAKfKo0zB4o-osNMAxxwe4aKcSxr4c,13515
|
128
|
+
sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
|
129
|
+
sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
|
130
|
+
sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
|
131
|
+
sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
|
132
|
+
sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
|
133
|
+
sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
|
134
|
+
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
135
|
+
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
136
|
+
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
137
|
+
sglang/test/test_utils.py,sha256=NkJuezjmonjgC3_i_CTBd8KSqWh6W9CLcgoaqvTNK2U,18684
|
138
|
+
sglang/test/srt/sampling/penaltylib/utils.py,sha256=Koe8GYoxIBUCz71of0oHhM5t5QcEd6a1IYq5SszRFAw,12730
|
139
|
+
sglang-0.3.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
140
|
+
sglang-0.3.4.dist-info/METADATA,sha256=rrkwX2teVdp79NEuOJfTBPUYCs_72LHIabuIesToPdI,40738
|
141
|
+
sglang-0.3.4.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
142
|
+
sglang-0.3.4.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
143
|
+
sglang-0.3.4.dist-info/RECORD,,
|
@@ -1,237 +0,0 @@
|
|
1
|
-
from enum import Enum, auto
|
2
|
-
|
3
|
-
import torch
|
4
|
-
import triton
|
5
|
-
import triton.language as tl
|
6
|
-
|
7
|
-
|
8
|
-
class WrapperDispatch(Enum):
|
9
|
-
SLIDING_WINDOW = auto()
|
10
|
-
CROSS_ATTENTION = auto()
|
11
|
-
|
12
|
-
|
13
|
-
@triton.jit
|
14
|
-
def create_flashinfer_kv_indices_triton(
|
15
|
-
req_to_token_ptr, # [max_batch, max_context_len]
|
16
|
-
req_pool_indices_ptr,
|
17
|
-
page_kernel_lens_ptr,
|
18
|
-
kv_indptr,
|
19
|
-
kv_start_idx,
|
20
|
-
kv_indices_ptr,
|
21
|
-
max_context_len: tl.constexpr,
|
22
|
-
):
|
23
|
-
BLOCK_SIZE: tl.constexpr = 512
|
24
|
-
pid = tl.program_id(axis=0)
|
25
|
-
req_pool_index = tl.load(req_pool_indices_ptr + pid)
|
26
|
-
kv_indices_offset = tl.load(kv_indptr + pid)
|
27
|
-
|
28
|
-
kv_start = 0
|
29
|
-
kv_end = 0
|
30
|
-
if kv_start_idx:
|
31
|
-
kv_start = tl.load(kv_start_idx + pid).to(tl.int32)
|
32
|
-
kv_end = kv_start
|
33
|
-
kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
|
34
|
-
|
35
|
-
req_to_token_ptr += req_pool_index * max_context_len
|
36
|
-
kv_indices_ptr += kv_indices_offset
|
37
|
-
|
38
|
-
ld_offset = kv_start + tl.arange(0, BLOCK_SIZE)
|
39
|
-
st_offset = tl.arange(0, BLOCK_SIZE)
|
40
|
-
num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
|
41
|
-
for _ in range(num_loop):
|
42
|
-
mask = ld_offset < kv_end
|
43
|
-
data = tl.load(req_to_token_ptr + ld_offset, mask=mask)
|
44
|
-
tl.store(kv_indices_ptr + st_offset, data, mask=mask)
|
45
|
-
ld_offset += BLOCK_SIZE
|
46
|
-
st_offset += BLOCK_SIZE
|
47
|
-
|
48
|
-
|
49
|
-
class FlashinferUpdater:
|
50
|
-
def __init__(
|
51
|
-
self,
|
52
|
-
forward_mode,
|
53
|
-
model_runner,
|
54
|
-
req_pool_indices,
|
55
|
-
seq_lens,
|
56
|
-
prefix_lens,
|
57
|
-
decode_wrappers=None,
|
58
|
-
use_ragged=False,
|
59
|
-
):
|
60
|
-
self.forward_mode = forward_mode
|
61
|
-
self.model_runner = model_runner
|
62
|
-
self.req_pool_indices = req_pool_indices
|
63
|
-
self.seq_lens = seq_lens
|
64
|
-
self.prefix_lens = prefix_lens
|
65
|
-
self.use_ragged = use_ragged
|
66
|
-
|
67
|
-
self.num_qo_heads = (
|
68
|
-
model_runner.model_config.num_attention_heads // model_runner.tp_size
|
69
|
-
)
|
70
|
-
self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
|
71
|
-
model_runner.tp_size
|
72
|
-
)
|
73
|
-
self.head_dim = model_runner.model_config.head_dim
|
74
|
-
self.batch_size = len(req_pool_indices)
|
75
|
-
|
76
|
-
self.decode_wrappers = (
|
77
|
-
decode_wrappers or self.model_runner.attn_backend.decode_wrappers
|
78
|
-
)
|
79
|
-
self.prefill_wrapper_ragged = (
|
80
|
-
self.model_runner.attn_backend.prefill_wrapper_ragged
|
81
|
-
)
|
82
|
-
self.prefill_wrappers_paged = (
|
83
|
-
self.model_runner.attn_backend.prefill_wrappers_paged
|
84
|
-
)
|
85
|
-
|
86
|
-
self.kv_last_page_len = torch.ones(
|
87
|
-
(self.batch_size,), dtype=torch.int32, device="cuda"
|
88
|
-
)
|
89
|
-
|
90
|
-
def _update_decode_indices(self, decode_wrapper):
|
91
|
-
assert not isinstance(decode_wrapper, list)
|
92
|
-
decode_wrapper.end_forward()
|
93
|
-
decode_wrapper.begin_forward(
|
94
|
-
self.kv_indptr,
|
95
|
-
self.kv_indices,
|
96
|
-
self.kv_last_page_len,
|
97
|
-
self.num_qo_heads,
|
98
|
-
self.num_kv_heads,
|
99
|
-
self.head_dim,
|
100
|
-
1,
|
101
|
-
data_type=self.model_runner.kv_cache_dtype,
|
102
|
-
q_data_type=self.model_runner.dtype,
|
103
|
-
)
|
104
|
-
|
105
|
-
def _update_extend_indices(self, ragged_wrapper, paged_wrapper):
|
106
|
-
assert not isinstance(paged_wrapper, list)
|
107
|
-
assert not isinstance(ragged_wrapper, list)
|
108
|
-
|
109
|
-
# extend part
|
110
|
-
qo_indptr = torch.zeros(
|
111
|
-
(self.batch_size + 1,), dtype=torch.int32, device="cuda"
|
112
|
-
)
|
113
|
-
qo_indptr[1:] = torch.cumsum(self.seq_lens - self.prefix_lens, dim=0)
|
114
|
-
|
115
|
-
if self.use_ragged:
|
116
|
-
ragged_wrapper.end_forward()
|
117
|
-
ragged_wrapper.begin_forward(
|
118
|
-
qo_indptr,
|
119
|
-
qo_indptr,
|
120
|
-
self.num_qo_heads,
|
121
|
-
self.num_kv_heads,
|
122
|
-
self.head_dim,
|
123
|
-
)
|
124
|
-
|
125
|
-
# cached part
|
126
|
-
paged_wrapper.end_forward()
|
127
|
-
paged_wrapper.begin_forward(
|
128
|
-
qo_indptr,
|
129
|
-
self.kv_indptr,
|
130
|
-
self.kv_indices,
|
131
|
-
self.kv_last_page_len,
|
132
|
-
self.num_qo_heads,
|
133
|
-
self.num_kv_heads,
|
134
|
-
self.head_dim,
|
135
|
-
1,
|
136
|
-
)
|
137
|
-
|
138
|
-
def _get_indices(self, dispatch_reason: WrapperDispatch = None, wrapper_id=0):
|
139
|
-
if dispatch_reason is None:
|
140
|
-
if self.use_ragged:
|
141
|
-
paged_kernel_lens = self.prefix_lens
|
142
|
-
else:
|
143
|
-
paged_kernel_lens = self.seq_lens
|
144
|
-
self.kv_start_idx = None
|
145
|
-
elif dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
|
146
|
-
if wrapper_id == 0:
|
147
|
-
# window attention use paged only
|
148
|
-
if self.forward_mode.is_decode():
|
149
|
-
paged_kernel_lens = torch.minimum(
|
150
|
-
self.seq_lens,
|
151
|
-
torch.tensor(self.model_runner.sliding_window_size + 1),
|
152
|
-
)
|
153
|
-
else:
|
154
|
-
paged_kernel_lens = torch.minimum(
|
155
|
-
self.seq_lens,
|
156
|
-
torch.tensor(self.model_runner.sliding_window_size)
|
157
|
-
+ self.seq_lens
|
158
|
-
- self.prefix_lens,
|
159
|
-
)
|
160
|
-
else:
|
161
|
-
# full attention
|
162
|
-
paged_kernel_lens = self.seq_lens
|
163
|
-
self.kv_start_idx = self.seq_lens - paged_kernel_lens
|
164
|
-
|
165
|
-
self.kv_indptr = torch.zeros(
|
166
|
-
(self.batch_size + 1,), dtype=torch.int32, device="cuda"
|
167
|
-
)
|
168
|
-
self.kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
|
169
|
-
self.kv_indices = torch.empty(
|
170
|
-
self.kv_indptr[-1], dtype=torch.int32, device="cuda"
|
171
|
-
)
|
172
|
-
|
173
|
-
create_flashinfer_kv_indices_triton[(self.batch_size,)](
|
174
|
-
self.model_runner.req_to_token_pool.req_to_token,
|
175
|
-
self.req_pool_indices,
|
176
|
-
paged_kernel_lens,
|
177
|
-
self.kv_indptr,
|
178
|
-
self.kv_start_idx,
|
179
|
-
self.kv_indices,
|
180
|
-
self.model_runner.req_to_token_pool.req_to_token.size(1),
|
181
|
-
)
|
182
|
-
|
183
|
-
def _update_indicess_single_wrapper(self):
|
184
|
-
self._get_indices()
|
185
|
-
|
186
|
-
if self.forward_mode.is_decode():
|
187
|
-
self._update_decode_indices(self.decode_wrappers[0])
|
188
|
-
else:
|
189
|
-
self._update_extend_indices(
|
190
|
-
self.prefill_wrapper_ragged,
|
191
|
-
self.prefill_wrappers_paged[0],
|
192
|
-
)
|
193
|
-
|
194
|
-
def _update_indices_cross_attention(self):
|
195
|
-
pass
|
196
|
-
|
197
|
-
def _update_indices_sliding_window(self):
|
198
|
-
assert self.use_ragged is False
|
199
|
-
for wrapper_id in range(2):
|
200
|
-
self._get_indices(WrapperDispatch.SLIDING_WINDOW, wrapper_id)
|
201
|
-
if self.forward_mode.is_decode():
|
202
|
-
self._update_decode_indices(self.decode_wrappers[wrapper_id])
|
203
|
-
else:
|
204
|
-
self._update_extend_indices(
|
205
|
-
None,
|
206
|
-
self.prefill_wrappers_paged[wrapper_id],
|
207
|
-
)
|
208
|
-
|
209
|
-
|
210
|
-
def update_flashinfer_indices(
|
211
|
-
forward_mode,
|
212
|
-
model_runner,
|
213
|
-
req_pool_indices,
|
214
|
-
seq_lens,
|
215
|
-
prefix_lens,
|
216
|
-
decode_wrappers=None,
|
217
|
-
use_ragged=False,
|
218
|
-
):
|
219
|
-
updater = FlashinferUpdater(
|
220
|
-
forward_mode,
|
221
|
-
model_runner,
|
222
|
-
req_pool_indices,
|
223
|
-
seq_lens,
|
224
|
-
prefix_lens,
|
225
|
-
decode_wrappers,
|
226
|
-
use_ragged,
|
227
|
-
)
|
228
|
-
|
229
|
-
dispatch_reason = model_runner.attn_backend.dispatch_reason
|
230
|
-
|
231
|
-
if dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
|
232
|
-
updater._update_indices_sliding_window()
|
233
|
-
elif dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
|
234
|
-
updater._update_indices_cross_attention()
|
235
|
-
else:
|
236
|
-
assert model_runner.attn_backend.num_wrappers == 1
|
237
|
-
updater._update_indicess_single_wrapper()
|