sglang 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. sglang/bench_latency.py +31 -13
  2. sglang/bench_server_latency.py +21 -10
  3. sglang/bench_serving.py +101 -7
  4. sglang/global_config.py +0 -1
  5. sglang/srt/conversation.py +11 -2
  6. sglang/srt/layers/attention/__init__.py +27 -5
  7. sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
  8. sglang/srt/layers/attention/flashinfer_backend.py +352 -83
  9. sglang/srt/layers/attention/triton_backend.py +6 -4
  10. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
  11. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
  12. sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
  13. sglang/srt/layers/sampler.py +6 -2
  14. sglang/srt/managers/data_parallel_controller.py +177 -0
  15. sglang/srt/managers/detokenizer_manager.py +31 -10
  16. sglang/srt/managers/io_struct.py +11 -2
  17. sglang/srt/managers/schedule_batch.py +126 -43
  18. sglang/srt/managers/schedule_policy.py +2 -1
  19. sglang/srt/managers/scheduler.py +245 -142
  20. sglang/srt/managers/tokenizer_manager.py +14 -1
  21. sglang/srt/managers/tp_worker.py +111 -1
  22. sglang/srt/mem_cache/chunk_cache.py +8 -4
  23. sglang/srt/mem_cache/memory_pool.py +77 -4
  24. sglang/srt/mem_cache/radix_cache.py +15 -7
  25. sglang/srt/model_executor/cuda_graph_runner.py +4 -4
  26. sglang/srt/model_executor/forward_batch_info.py +16 -21
  27. sglang/srt/model_executor/model_runner.py +100 -36
  28. sglang/srt/models/baichuan.py +2 -3
  29. sglang/srt/models/chatglm.py +5 -6
  30. sglang/srt/models/commandr.py +1 -2
  31. sglang/srt/models/dbrx.py +1 -2
  32. sglang/srt/models/deepseek.py +4 -5
  33. sglang/srt/models/deepseek_v2.py +5 -6
  34. sglang/srt/models/exaone.py +1 -2
  35. sglang/srt/models/gemma.py +2 -2
  36. sglang/srt/models/gemma2.py +5 -5
  37. sglang/srt/models/gpt_bigcode.py +5 -5
  38. sglang/srt/models/grok.py +1 -2
  39. sglang/srt/models/internlm2.py +1 -2
  40. sglang/srt/models/llama.py +1 -2
  41. sglang/srt/models/llama_classification.py +1 -2
  42. sglang/srt/models/llama_reward.py +2 -3
  43. sglang/srt/models/llava.py +4 -8
  44. sglang/srt/models/llavavid.py +1 -2
  45. sglang/srt/models/minicpm.py +1 -2
  46. sglang/srt/models/minicpm3.py +5 -6
  47. sglang/srt/models/mixtral.py +1 -2
  48. sglang/srt/models/mixtral_quant.py +1 -2
  49. sglang/srt/models/olmo.py +352 -0
  50. sglang/srt/models/olmoe.py +1 -2
  51. sglang/srt/models/qwen.py +1 -2
  52. sglang/srt/models/qwen2.py +1 -2
  53. sglang/srt/models/qwen2_moe.py +4 -5
  54. sglang/srt/models/stablelm.py +1 -2
  55. sglang/srt/models/torch_native_llama.py +1 -2
  56. sglang/srt/models/xverse.py +1 -2
  57. sglang/srt/models/xverse_moe.py +4 -5
  58. sglang/srt/models/yivl.py +1 -2
  59. sglang/srt/openai_api/adapter.py +97 -52
  60. sglang/srt/openai_api/protocol.py +10 -2
  61. sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
  62. sglang/srt/sampling/sampling_batch_info.py +105 -59
  63. sglang/srt/sampling/sampling_params.py +2 -0
  64. sglang/srt/server.py +171 -37
  65. sglang/srt/server_args.py +127 -48
  66. sglang/srt/utils.py +37 -14
  67. sglang/test/few_shot_gsm8k.py +4 -1
  68. sglang/test/few_shot_gsm8k_engine.py +144 -0
  69. sglang/test/srt/sampling/penaltylib/utils.py +16 -12
  70. sglang/version.py +1 -1
  71. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/METADATA +82 -32
  72. sglang-0.3.4.dist-info/RECORD +143 -0
  73. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/WHEEL +1 -1
  74. sglang/srt/layers/attention/flashinfer_utils.py +0 -237
  75. sglang-0.3.3.dist-info/RECORD +0 -139
  76. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/LICENSE +0 -0
  77. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -219,36 +219,49 @@ Requires-Dist: sglang[srt]; extra == "all"
219
219
  Requires-Dist: sglang[openai]; extra == "all"
220
220
  Requires-Dist: sglang[anthropic]; extra == "all"
221
221
  Requires-Dist: sglang[litellm]; extra == "all"
222
+ Provides-Extra: all_xpu
223
+ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
224
+ Requires-Dist: sglang[openai]; extra == "all-xpu"
225
+ Requires-Dist: sglang[anthropic]; extra == "all-xpu"
226
+ Requires-Dist: sglang[litellm]; extra == "all-xpu"
222
227
  Provides-Extra: anthropic
223
228
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
224
229
  Provides-Extra: dev
225
230
  Requires-Dist: sglang[all]; extra == "dev"
226
231
  Requires-Dist: sglang[test]; extra == "dev"
232
+ Provides-Extra: dev_xpu
233
+ Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
234
+ Requires-Dist: sglang[test]; extra == "dev-xpu"
227
235
  Provides-Extra: litellm
228
236
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
229
237
  Provides-Extra: openai
230
238
  Requires-Dist: openai>=1.0; extra == "openai"
231
239
  Requires-Dist: tiktoken; extra == "openai"
240
+ Provides-Extra: runtime_common
241
+ Requires-Dist: aiohttp; extra == "runtime-common"
242
+ Requires-Dist: decord; extra == "runtime-common"
243
+ Requires-Dist: fastapi; extra == "runtime-common"
244
+ Requires-Dist: hf-transfer; extra == "runtime-common"
245
+ Requires-Dist: huggingface-hub; extra == "runtime-common"
246
+ Requires-Dist: interegular; extra == "runtime-common"
247
+ Requires-Dist: orjson; extra == "runtime-common"
248
+ Requires-Dist: packaging; extra == "runtime-common"
249
+ Requires-Dist: pillow; extra == "runtime-common"
250
+ Requires-Dist: psutil; extra == "runtime-common"
251
+ Requires-Dist: pydantic; extra == "runtime-common"
252
+ Requires-Dist: python-multipart; extra == "runtime-common"
253
+ Requires-Dist: torchao; extra == "runtime-common"
254
+ Requires-Dist: uvicorn; extra == "runtime-common"
255
+ Requires-Dist: uvloop; extra == "runtime-common"
256
+ Requires-Dist: zmq; extra == "runtime-common"
257
+ Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
258
+ Requires-Dist: modelscope; extra == "runtime-common"
232
259
  Provides-Extra: srt
233
- Requires-Dist: aiohttp; extra == "srt"
234
- Requires-Dist: decord; extra == "srt"
235
- Requires-Dist: fastapi; extra == "srt"
236
- Requires-Dist: hf-transfer; extra == "srt"
237
- Requires-Dist: huggingface-hub; extra == "srt"
238
- Requires-Dist: interegular; extra == "srt"
239
- Requires-Dist: packaging; extra == "srt"
240
- Requires-Dist: pillow; extra == "srt"
241
- Requires-Dist: psutil; extra == "srt"
242
- Requires-Dist: pydantic; extra == "srt"
243
- Requires-Dist: python-multipart; extra == "srt"
260
+ Requires-Dist: sglang[runtime_common]; extra == "srt"
244
261
  Requires-Dist: torch; extra == "srt"
245
- Requires-Dist: torchao; extra == "srt"
246
- Requires-Dist: uvicorn; extra == "srt"
247
- Requires-Dist: uvloop; extra == "srt"
248
- Requires-Dist: zmq; extra == "srt"
249
262
  Requires-Dist: vllm==0.5.5; extra == "srt"
250
- Requires-Dist: outlines>=0.0.44; extra == "srt"
251
- Requires-Dist: modelscope; extra == "srt"
263
+ Provides-Extra: srt_xpu
264
+ Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
252
265
  Provides-Extra: test
253
266
  Requires-Dist: jsonlines; extra == "test"
254
267
  Requires-Dist: matplotlib; extra == "test"
@@ -257,8 +270,8 @@ Requires-Dist: sentence-transformers; extra == "test"
257
270
  Requires-Dist: accelerate; extra == "test"
258
271
  Requires-Dist: peft; extra == "test"
259
272
 
260
- <div align="center">
261
- <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
273
+ <div align="center" id="sglangtop">
274
+ <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
262
275
 
263
276
  [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
264
277
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -270,15 +283,13 @@ Requires-Dist: peft; extra == "test"
270
283
 
271
284
  --------------------------------------------------------------------------------
272
285
 
273
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
274
-
275
- ## Upcoming Events
276
- - [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
277
- - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
286
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
287
+ [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
278
288
 
279
289
  ## News
280
- - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
281
- - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
290
+ - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
291
+ - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
292
+ - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
282
293
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
283
294
 
284
295
  <details>
@@ -324,7 +335,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
324
335
  ### Method 2: From source
325
336
  ```
326
337
  # Use the last release branch
327
- git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
338
+ git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
328
339
  cd sglang
329
340
 
330
341
  pip install --upgrade pip
@@ -501,6 +512,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
501
512
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
502
513
  ```
503
514
 
515
+ ### Engine Without HTTP Server
516
+
517
+ We also provide an inference engine **without an HTTP server**. For example,
518
+
519
+ ```python
520
+ import sglang as sgl
521
+
522
+
523
+ def main():
524
+ prompts = [
525
+ "Hello, my name is",
526
+ "The president of the United States is",
527
+ "The capital of France is",
528
+ "The future of AI is",
529
+ ]
530
+ sampling_params = {"temperature": 0.8, "top_p": 0.95}
531
+ llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
532
+
533
+ outputs = llm.generate(prompts, sampling_params)
534
+ for prompt, output in zip(prompts, outputs):
535
+ print("===============================")
536
+ print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
537
+
538
+ if __name__ == "__main__":
539
+ main()
540
+ ```
541
+
542
+ This can be used for:
543
+
544
+ 1. **Offline Batch Inference**
545
+ 2. **Building Custom Servers**
546
+
547
+ You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).
548
+
504
549
  ### Supported Models
505
550
 
506
551
  **Generative Models**
@@ -837,10 +882,7 @@ def chat_example(s):
837
882
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
838
883
 
839
884
  ## Benchmark And Performance
840
- ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
841
- ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
842
-
843
- Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
885
+ Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
844
886
 
845
887
  ## Roadmap
846
888
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -848,3 +890,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
848
890
  ## Citation And Acknowledgment
849
891
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
850
892
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
893
+
894
+
895
+
896
+ <p align="center">
897
+ <a href="#sglangtop" target="_blank">
898
+ <b>Back To Top</b>
899
+ </a>
900
+ </p>
@@ -0,0 +1,143 @@
1
+ sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
2
+ sglang/api.py,sha256=5x591S4rLbmNPs75qPwGKVu1sonVGDyjPAJlHTyWw50,6956
3
+ sglang/bench_latency.py,sha256=RWSyZ-UhLV6dyPMMtK3nSOoNsjCY5xMpYKeUKRNtdcA,18276
4
+ sglang/bench_server_latency.py,sha256=2AMPwU2_85q-Btz9UdZC-TnZJPgXcNkydvFYWn2CJlU,5892
5
+ sglang/bench_serving.py,sha256=jcxNP7reIJPh3x1hG5TCM6wMlDXshjyMJUUjL2O7kzs,40060
6
+ sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
7
+ sglang/global_config.py,sha256=1r_W9rrBxGCCc2eqESRduOMMNq46e54xLgFLifHuQm0,1014
8
+ sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
9
+ sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
10
+ sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
11
+ sglang/version.py,sha256=oYLGMpySamd16KLiaBTfRyrAS7_oyp-TOEHmzmeumwg,22
12
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
14
+ sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
15
+ sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
16
+ sglang/lang/interpreter.py,sha256=zakc6IkzATaMqVDWKWvqDRrqnRykxFawajA7aUHUDbI,30640
17
+ sglang/lang/ir.py,sha256=F_9ac10OjktxR7KhOV07wiJXV20s79cRfh9d4koExJc,18262
18
+ sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
19
+ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
21
+ sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
22
+ sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
23
+ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
24
+ sglang/lang/backend/runtime_endpoint.py,sha256=iVb7SlrpJ1ic92QG5kQUphZUb2EaVWY43dkmAO5pju4,10514
25
+ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
26
+ sglang/srt/conversation.py,sha256=B4QPGOUVdoXRJwWWxSm5pfifGpuBs07fDTxJ1BHUXLw,20003
27
+ sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
28
+ sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
29
+ sglang/srt/server.py,sha256=65b39k4FN_TzL8qAimS1mRx8xdO8jmKCdUftOISUv7M,26809
30
+ sglang/srt/server_args.py,sha256=IDuX8ZCJd_6t2xHf7wNGskVgvpAQtUcFSGBnKFnLf3U,27290
31
+ sglang/srt/utils.py,sha256=0zalNeGrtrIyfmD7DHLRqocCY1_yNbPCD5hmionHpL0,23071
32
+ sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
33
+ sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
34
+ sglang/srt/configs/model_config.py,sha256=36My-o44trhWY3KYDeSFMGvv9XuUtIVI5e7F8VlOTWo,6723
35
+ sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
36
+ sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
37
+ sglang/srt/constrained/fsm_cache.py,sha256=9GtliIN55Ov8Q9MSFfQC5rKrz3qTsB7Cm5OkhivKngY,3271
38
+ sglang/srt/constrained/jump_forward.py,sha256=o-CzJu3DEs0eFKlLzsQVYMSo4vBKpffs25sXLOJd6jc,6997
39
+ sglang/srt/layers/activation.py,sha256=7VEkCrx2dvl629Lz0fkJcJfVoZA-ykEdkpTzKEc_drQ,5225
40
+ sglang/srt/layers/layernorm.py,sha256=HCj8Y_X6MNNdtQU2sWKgyjIqVERxl9dqrmjbBbyJjpE,3796
41
+ sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
42
+ sglang/srt/layers/logits_processor.py,sha256=Fq7VHwjP4iSzl_OBLo8qw_HVbIDbYB-0MGmfiD3Jk_E,12521
43
+ sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
44
+ sglang/srt/layers/radix_attention.py,sha256=i07VRXPDHj-zJ1TSrXEqCxumQwYSHwAvc8DoIg-Irtg,1964
45
+ sglang/srt/layers/sampler.py,sha256=23wRDw2Fs3wZfPBh6gFBz2vqwxnoDd9LAHWq7YdQWlc,4166
46
+ sglang/srt/layers/torchao_utils.py,sha256=1nzZkSzbF4qCAMeBKAeeDpMl_mK8imiY2RL3xFEgvAw,3340
47
+ sglang/srt/layers/attention/__init__.py,sha256=hyrPpnuiTs5VIZNyoIjZLRsHUX20gX2dvY9kkqyXIII,2158
48
+ sglang/srt/layers/attention/double_sparsity_backend.py,sha256=owzPwLWcShZ0ezkVjBr0vV73vtQIUh8z-rcQtXLG1fk,10374
49
+ sglang/srt/layers/attention/flashinfer_backend.py,sha256=y9saTqOhb_tJoRtjq9sishlQFGYlFkQD7QcV9x_ureo,19554
50
+ sglang/srt/layers/attention/triton_backend.py,sha256=B6YuIZWh4Zn85Q57i0z3eZ08RCrS0rfyPJzkRr_zOIQ,6150
51
+ sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
52
+ sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
53
+ sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=nEG7iBh1pAy3WaqPdLZwCJwDgyk5HLQ181kBS2nxbwg,11179
54
+ sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=bNHHZeEowwI1wwOWj2T6bjBTBtVZUbcL-0cgfZwpHek,5471
55
+ sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
56
+ sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
57
+ sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
58
+ sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
59
+ sglang/srt/layers/quantization/__init__.py,sha256=QilMNqgu3eOFUkEjXLSDa1NvoNdi_CAvC8a1hprOgN8,2979
60
+ sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
61
+ sglang/srt/lora/lora.py,sha256=a5j_Yy0s95msVPFgOuH5PCe7sMu0AyZFQ5wL0H-YIg8,14913
62
+ sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
63
+ sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
64
+ sglang/srt/managers/data_parallel_controller.py,sha256=GJGfX1-5DoQFZ-EMh_p02nvrOtrOc0UebnULWHhFrss,5765
65
+ sglang/srt/managers/detokenizer_manager.py,sha256=G2-Y-nDbq7LF8ZFWcXXcMkbCwzyBEh1g4UrciDlkNYY,7985
66
+ sglang/srt/managers/image_processor.py,sha256=9Y9RqyLdbt4uOK7pnJCJIhY77791klskSrEg8U6pyS4,6910
67
+ sglang/srt/managers/io_struct.py,sha256=QJ1Eu7XNsRH35ec5nUOUS2XSjiFWyjYHKsnMqviC_Mk,12298
68
+ sglang/srt/managers/schedule_batch.py,sha256=6LUXbAd6PvHIDk6iwpsufosahIT2z32JZjsNx5xeKcg,33930
69
+ sglang/srt/managers/schedule_policy.py,sha256=unDmK7Y_Ti0Eiizh3_iEFMsC1KDqGMTqU8MlQgg-6qo,11951
70
+ sglang/srt/managers/scheduler.py,sha256=Y7R-VkLt8Az2jZGrGRuhG1g4UPO5y-7b9BaOknuC2aI,43019
71
+ sglang/srt/managers/tokenizer_manager.py,sha256=SprHC0Bs8kvtnYboDPrH587uO_sdKHyp7tVBCdbEB9c,25066
72
+ sglang/srt/managers/tp_worker.py,sha256=nwkIXiasGA4w97pnMG32U1DN1RlLFkvETvl9q7SjGeY,8887
73
+ sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
74
+ sglang/srt/mem_cache/chunk_cache.py,sha256=VcCpyrf5FOQ5xoKeOouCI5ZQLkZo_pgY1SPbDDkagGg,2492
75
+ sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
76
+ sglang/srt/mem_cache/memory_pool.py,sha256=ihVZXlJ_Fvs1L2c2SZQaijUYSn9X6eyiFiG2NNRQS_M,9297
77
+ sglang/srt/mem_cache/radix_cache.py,sha256=cS6G5uOW_0QICH30PXxatetka4wnELfhP4czHn8RDJE,10414
78
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=KgSBvoF2IyCGDFNXQyN7sV3E_S2NndeMQyaQZB97Pak,10499
79
+ sglang/srt/model_executor/forward_batch_info.py,sha256=rSazAtkWKyc2g2QILT2-AsUdaBt51hQBU1qcS2iw_Nw,5690
80
+ sglang/srt/model_executor/model_runner.py,sha256=hn-VeLABZL4y_GcJFl2y_C7x6ZS_Xw0tDyKzOxYMtVQ,25983
81
+ sglang/srt/models/baichuan.py,sha256=uV20fr7SqlATxoziXRtJyXFnZZoWTUy3qvQNFaEvw8M,15014
82
+ sglang/srt/models/chatglm.py,sha256=uep4Wy_2jwn_x6Cvagt5rs3JRY_AlcM-VXvRTCFC5Tc,13172
83
+ sglang/srt/models/commandr.py,sha256=WIMwjV3C0pRbVs4Xv9tqnHGreRvWC7zsML2hNuXw4A0,14060
84
+ sglang/srt/models/dbrx.py,sha256=_DshXyXr_xVB7wtE28PFcb6KDIkA6gygkXYKqivSCFc,14554
85
+ sglang/srt/models/deepseek.py,sha256=W342tVpEpkc_fvO_DTP4fX3EGF-DIFC4QOySdUGzl9w,15837
86
+ sglang/srt/models/deepseek_v2.py,sha256=5P5678aaMT4iO4jS0dZWUiRG4o9EE98xVgs6Zjy-Mr0,28229
87
+ sglang/srt/models/exaone.py,sha256=bIXdAXoWlCdfDdX2q47Br3QOa3jEYiiP2Hdd1T4crnM,12993
88
+ sglang/srt/models/gemma.py,sha256=4MVHwc5Jc4CSg3HIdNJEBYk8mhspjuwvc_6Oi8Cd-g8,12202
89
+ sglang/srt/models/gemma2.py,sha256=3VL223T_3syBG3fUInbtFaXvIs7dYjtsfX3OfDQc7m4,14777
90
+ sglang/srt/models/gpt_bigcode.py,sha256=q9N13Js2v0VheudWssRoSjnptS6TSf7DOmC8zLRGxeo,10049
91
+ sglang/srt/models/grok.py,sha256=vc7-E_hemNKaNORxg4rmaQcVYlpoavyaAZUG9B2dgbY,14835
92
+ sglang/srt/models/internlm2.py,sha256=-liQB13sgR3GnXJacBSMuEbLa2N4tICx0LsNgu_nNvU,12108
93
+ sglang/srt/models/llama.py,sha256=a43Y5mvMDmFcRcPL78vsAElaOvTqPajLPB2_BDwJ7pM,15767
94
+ sglang/srt/models/llama_classification.py,sha256=WcHYFez7qloTCpXLy1A6-dBGHWp22ebv6yG68jFVBjc,3318
95
+ sglang/srt/models/llama_embedding.py,sha256=4j3WNLB-x7XQnJvohdRs7VSSEabbhiE2BRHmnG5IZRU,3453
96
+ sglang/srt/models/llama_reward.py,sha256=ag3eVdP38iURj81fTCa-sC2jV_eCkTIjXUQf1I96fCI,5297
97
+ sglang/srt/models/llava.py,sha256=ny3sK2sgYwrEhawSAc1tZeltcgukphSTdxsqyq-Epkc,24857
98
+ sglang/srt/models/llavavid.py,sha256=ztS5He-NF4fmfujdoMnKljOG1fNfPvp-6bduT7B6EMU,12137
99
+ sglang/srt/models/minicpm.py,sha256=LpUdxKA27z79DSYAPPlfCgI4GEnWCYznhgSQl-QCsTY,13731
100
+ sglang/srt/models/minicpm3.py,sha256=-fLZ-RRbR2jLGSsatBWV-qsSNIZCPbS_jasmrOlUdK8,25023
101
+ sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
102
+ sglang/srt/models/mixtral.py,sha256=UUqzpOBXsObirmpJz4xstlG82uu4JfXsh-gWQmiKbW0,13803
103
+ sglang/srt/models/mixtral_quant.py,sha256=HPipVG_Gc5Ki0YXg49Rwn2_uvtCCI1IxlA7mVRVFivw,13978
104
+ sglang/srt/models/olmo.py,sha256=lD4VewXK0rVqhttGkOOzaxoqRQgVfV90s8ElStPBBdE,11896
105
+ sglang/srt/models/olmoe.py,sha256=3qHnY1DWBhyx9FWGJGb3a8kewcmEdYZOkYZ1JBx1LWs,15251
106
+ sglang/srt/models/qwen.py,sha256=mjGqo3NkTYfJ2qqztFw8mjKggPT2moW15nQgrq3GxWk,9860
107
+ sglang/srt/models/qwen2.py,sha256=I2ZzH9pVTZdjP1fHlq1qdG4JiWHt1CC6t1EK2gN5Ppc,12337
108
+ sglang/srt/models/qwen2_moe.py,sha256=BaNq8xgZKqjr_fcEBtH4yjBSc3-p4VztPiknVwllcQk,16984
109
+ sglang/srt/models/stablelm.py,sha256=0NWUVsYGhbc_X2eT9x38MaaUhZGmFtMgw_2PBv25Yxw,11265
110
+ sglang/srt/models/torch_native_llama.py,sha256=dtasdhwfRPE1eOcAIFUBsHrDnkjegXvo8WhGlqvXGKk,19154
111
+ sglang/srt/models/xverse.py,sha256=v4OaFdss9oD5YNzXsnjoXE9ffCkXL9U5o0OWLm1vHQQ,13573
112
+ sglang/srt/models/xverse_moe.py,sha256=A8EB82NpozoBplp7Qd8B_kY_3cL-UMydAxYIrhACVPE,15682
113
+ sglang/srt/models/yivl.py,sha256=xcWqkuZ29FmBBJY6aKetwItWIPl-kfXK-QmgdLONles,4765
114
+ sglang/srt/openai_api/adapter.py,sha256=WkYCKVaYTkFdLrySBhlkDyHJVaaHMF7KrhNnmw3L3us,53534
115
+ sglang/srt/openai_api/protocol.py,sha256=EZ6G209rBEDP7cepO2kAYqE8wMe1ksYdN7to1iT97Lw,10248
116
+ sglang/srt/sampling/sampling_batch_info.py,sha256=EAdep3I5qmbDDQJ0Ktrq0ySXJ6DCrTAjniEwFu4ZRqE,7679
117
+ sglang/srt/sampling/sampling_params.py,sha256=ZPHCQq7Bi4P_sxUzdKgYVXZpB_tC-kA7rlLwiW9Ct9A,5781
118
+ sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
119
+ sglang/srt/sampling/penaltylib/orchestrator.py,sha256=kizcPnxtRawmDt6utRuhbk4yfNs5H5mx1DAlDVEZRv8,11328
120
+ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
121
+ sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
122
+ sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
123
+ sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
124
+ sglang/test/few_shot_gsm8k.py,sha256=ll-gNbcv829IwSPXAZt4JIEIu8IR3APCLcX3BHOFVp8,3968
125
+ sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
126
+ sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
127
+ sglang/test/runners.py,sha256=VCmtH08FsAq_JTAKfKo0zB4o-osNMAxxwe4aKcSxr4c,13515
128
+ sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
129
+ sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
130
+ sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
131
+ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
132
+ sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
133
+ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
134
+ sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
135
+ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
136
+ sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
137
+ sglang/test/test_utils.py,sha256=NkJuezjmonjgC3_i_CTBd8KSqWh6W9CLcgoaqvTNK2U,18684
138
+ sglang/test/srt/sampling/penaltylib/utils.py,sha256=Koe8GYoxIBUCz71of0oHhM5t5QcEd6a1IYq5SszRFAw,12730
139
+ sglang-0.3.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
140
+ sglang-0.3.4.dist-info/METADATA,sha256=rrkwX2teVdp79NEuOJfTBPUYCs_72LHIabuIesToPdI,40738
141
+ sglang-0.3.4.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
142
+ sglang-0.3.4.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
143
+ sglang-0.3.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,237 +0,0 @@
1
- from enum import Enum, auto
2
-
3
- import torch
4
- import triton
5
- import triton.language as tl
6
-
7
-
8
- class WrapperDispatch(Enum):
9
- SLIDING_WINDOW = auto()
10
- CROSS_ATTENTION = auto()
11
-
12
-
13
- @triton.jit
14
- def create_flashinfer_kv_indices_triton(
15
- req_to_token_ptr, # [max_batch, max_context_len]
16
- req_pool_indices_ptr,
17
- page_kernel_lens_ptr,
18
- kv_indptr,
19
- kv_start_idx,
20
- kv_indices_ptr,
21
- max_context_len: tl.constexpr,
22
- ):
23
- BLOCK_SIZE: tl.constexpr = 512
24
- pid = tl.program_id(axis=0)
25
- req_pool_index = tl.load(req_pool_indices_ptr + pid)
26
- kv_indices_offset = tl.load(kv_indptr + pid)
27
-
28
- kv_start = 0
29
- kv_end = 0
30
- if kv_start_idx:
31
- kv_start = tl.load(kv_start_idx + pid).to(tl.int32)
32
- kv_end = kv_start
33
- kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
34
-
35
- req_to_token_ptr += req_pool_index * max_context_len
36
- kv_indices_ptr += kv_indices_offset
37
-
38
- ld_offset = kv_start + tl.arange(0, BLOCK_SIZE)
39
- st_offset = tl.arange(0, BLOCK_SIZE)
40
- num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
41
- for _ in range(num_loop):
42
- mask = ld_offset < kv_end
43
- data = tl.load(req_to_token_ptr + ld_offset, mask=mask)
44
- tl.store(kv_indices_ptr + st_offset, data, mask=mask)
45
- ld_offset += BLOCK_SIZE
46
- st_offset += BLOCK_SIZE
47
-
48
-
49
- class FlashinferUpdater:
50
- def __init__(
51
- self,
52
- forward_mode,
53
- model_runner,
54
- req_pool_indices,
55
- seq_lens,
56
- prefix_lens,
57
- decode_wrappers=None,
58
- use_ragged=False,
59
- ):
60
- self.forward_mode = forward_mode
61
- self.model_runner = model_runner
62
- self.req_pool_indices = req_pool_indices
63
- self.seq_lens = seq_lens
64
- self.prefix_lens = prefix_lens
65
- self.use_ragged = use_ragged
66
-
67
- self.num_qo_heads = (
68
- model_runner.model_config.num_attention_heads // model_runner.tp_size
69
- )
70
- self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
71
- model_runner.tp_size
72
- )
73
- self.head_dim = model_runner.model_config.head_dim
74
- self.batch_size = len(req_pool_indices)
75
-
76
- self.decode_wrappers = (
77
- decode_wrappers or self.model_runner.attn_backend.decode_wrappers
78
- )
79
- self.prefill_wrapper_ragged = (
80
- self.model_runner.attn_backend.prefill_wrapper_ragged
81
- )
82
- self.prefill_wrappers_paged = (
83
- self.model_runner.attn_backend.prefill_wrappers_paged
84
- )
85
-
86
- self.kv_last_page_len = torch.ones(
87
- (self.batch_size,), dtype=torch.int32, device="cuda"
88
- )
89
-
90
- def _update_decode_indices(self, decode_wrapper):
91
- assert not isinstance(decode_wrapper, list)
92
- decode_wrapper.end_forward()
93
- decode_wrapper.begin_forward(
94
- self.kv_indptr,
95
- self.kv_indices,
96
- self.kv_last_page_len,
97
- self.num_qo_heads,
98
- self.num_kv_heads,
99
- self.head_dim,
100
- 1,
101
- data_type=self.model_runner.kv_cache_dtype,
102
- q_data_type=self.model_runner.dtype,
103
- )
104
-
105
- def _update_extend_indices(self, ragged_wrapper, paged_wrapper):
106
- assert not isinstance(paged_wrapper, list)
107
- assert not isinstance(ragged_wrapper, list)
108
-
109
- # extend part
110
- qo_indptr = torch.zeros(
111
- (self.batch_size + 1,), dtype=torch.int32, device="cuda"
112
- )
113
- qo_indptr[1:] = torch.cumsum(self.seq_lens - self.prefix_lens, dim=0)
114
-
115
- if self.use_ragged:
116
- ragged_wrapper.end_forward()
117
- ragged_wrapper.begin_forward(
118
- qo_indptr,
119
- qo_indptr,
120
- self.num_qo_heads,
121
- self.num_kv_heads,
122
- self.head_dim,
123
- )
124
-
125
- # cached part
126
- paged_wrapper.end_forward()
127
- paged_wrapper.begin_forward(
128
- qo_indptr,
129
- self.kv_indptr,
130
- self.kv_indices,
131
- self.kv_last_page_len,
132
- self.num_qo_heads,
133
- self.num_kv_heads,
134
- self.head_dim,
135
- 1,
136
- )
137
-
138
- def _get_indices(self, dispatch_reason: WrapperDispatch = None, wrapper_id=0):
139
- if dispatch_reason is None:
140
- if self.use_ragged:
141
- paged_kernel_lens = self.prefix_lens
142
- else:
143
- paged_kernel_lens = self.seq_lens
144
- self.kv_start_idx = None
145
- elif dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
146
- if wrapper_id == 0:
147
- # window attention use paged only
148
- if self.forward_mode.is_decode():
149
- paged_kernel_lens = torch.minimum(
150
- self.seq_lens,
151
- torch.tensor(self.model_runner.sliding_window_size + 1),
152
- )
153
- else:
154
- paged_kernel_lens = torch.minimum(
155
- self.seq_lens,
156
- torch.tensor(self.model_runner.sliding_window_size)
157
- + self.seq_lens
158
- - self.prefix_lens,
159
- )
160
- else:
161
- # full attention
162
- paged_kernel_lens = self.seq_lens
163
- self.kv_start_idx = self.seq_lens - paged_kernel_lens
164
-
165
- self.kv_indptr = torch.zeros(
166
- (self.batch_size + 1,), dtype=torch.int32, device="cuda"
167
- )
168
- self.kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
169
- self.kv_indices = torch.empty(
170
- self.kv_indptr[-1], dtype=torch.int32, device="cuda"
171
- )
172
-
173
- create_flashinfer_kv_indices_triton[(self.batch_size,)](
174
- self.model_runner.req_to_token_pool.req_to_token,
175
- self.req_pool_indices,
176
- paged_kernel_lens,
177
- self.kv_indptr,
178
- self.kv_start_idx,
179
- self.kv_indices,
180
- self.model_runner.req_to_token_pool.req_to_token.size(1),
181
- )
182
-
183
- def _update_indicess_single_wrapper(self):
184
- self._get_indices()
185
-
186
- if self.forward_mode.is_decode():
187
- self._update_decode_indices(self.decode_wrappers[0])
188
- else:
189
- self._update_extend_indices(
190
- self.prefill_wrapper_ragged,
191
- self.prefill_wrappers_paged[0],
192
- )
193
-
194
- def _update_indices_cross_attention(self):
195
- pass
196
-
197
- def _update_indices_sliding_window(self):
198
- assert self.use_ragged is False
199
- for wrapper_id in range(2):
200
- self._get_indices(WrapperDispatch.SLIDING_WINDOW, wrapper_id)
201
- if self.forward_mode.is_decode():
202
- self._update_decode_indices(self.decode_wrappers[wrapper_id])
203
- else:
204
- self._update_extend_indices(
205
- None,
206
- self.prefill_wrappers_paged[wrapper_id],
207
- )
208
-
209
-
210
- def update_flashinfer_indices(
211
- forward_mode,
212
- model_runner,
213
- req_pool_indices,
214
- seq_lens,
215
- prefix_lens,
216
- decode_wrappers=None,
217
- use_ragged=False,
218
- ):
219
- updater = FlashinferUpdater(
220
- forward_mode,
221
- model_runner,
222
- req_pool_indices,
223
- seq_lens,
224
- prefix_lens,
225
- decode_wrappers,
226
- use_ragged,
227
- )
228
-
229
- dispatch_reason = model_runner.attn_backend.dispatch_reason
230
-
231
- if dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
232
- updater._update_indices_sliding_window()
233
- elif dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
234
- updater._update_indices_cross_attention()
235
- else:
236
- assert model_runner.attn_backend.num_wrappers == 1
237
- updater._update_indicess_single_wrapper()