sglang 0.3.4.tar.gz → 0.3.4.post1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. {sglang-0.3.4/sglang.egg-info → sglang-0.3.4.post1}/PKG-INFO +6 -6
  2. {sglang-0.3.4 → sglang-0.3.4.post1}/README.md +4 -4
  3. {sglang-0.3.4 → sglang-0.3.4.post1}/pyproject.toml +30 -11
  4. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/bench_latency.py +2 -1
  5. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/chat_template.py +17 -0
  6. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/launch_server_llavavid.py +1 -1
  7. sglang-0.3.4.post1/sglang/srt/configs/__init__.py +8 -0
  8. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/configs/model_config.py +2 -0
  9. sglang-0.3.4.post1/sglang/srt/configs/qwen2vl.py +133 -0
  10. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/conversation.py +27 -0
  11. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/hf_transformers_utils.py +2 -1
  12. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/__init__.py +16 -5
  13. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
  14. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/flashinfer_backend.py +171 -51
  15. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_backend.py +22 -6
  16. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
  17. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/linear.py +89 -63
  18. sglang-0.3.4.post1/sglang/srt/layers/rotary_embedding.py +145 -0
  19. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/lora/lora.py +3 -1
  20. sglang-0.3.4.post1/sglang/srt/managers/image_processor.py +360 -0
  21. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/schedule_batch.py +225 -65
  22. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/scheduler.py +61 -48
  23. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/tokenizer_manager.py +10 -4
  24. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/tp_worker.py +26 -111
  25. sglang-0.3.4.post1/sglang/srt/managers/tp_worker_overlap_thread.py +172 -0
  26. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/mem_cache/memory_pool.py +47 -8
  27. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/mem_cache/radix_cache.py +4 -3
  28. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/model_executor/cuda_graph_runner.py +59 -8
  29. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/model_executor/forward_batch_info.py +86 -3
  30. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/model_executor/model_runner.py +32 -11
  31. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/chatglm.py +3 -3
  32. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/deepseek_v2.py +2 -2
  33. sglang-0.3.4.post1/sglang/srt/models/mllama.py +1004 -0
  34. sglang-0.3.4.post1/sglang/srt/models/qwen2_vl.py +724 -0
  35. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/sampling_batch_info.py +13 -3
  36. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/server_args.py +10 -0
  37. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/utils.py +22 -0
  38. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/runners.py +20 -1
  39. sglang-0.3.4.post1/sglang/version.py +1 -0
  40. {sglang-0.3.4 → sglang-0.3.4.post1/sglang.egg-info}/PKG-INFO +6 -6
  41. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang.egg-info/SOURCES.txt +5 -0
  42. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang.egg-info/requires.txt +1 -1
  43. sglang-0.3.4/sglang/srt/configs/__init__.py +0 -5
  44. sglang-0.3.4/sglang/srt/managers/image_processor.py +0 -187
  45. sglang-0.3.4/sglang/version.py +0 -1
  46. {sglang-0.3.4 → sglang-0.3.4.post1}/LICENSE +0 -0
  47. {sglang-0.3.4 → sglang-0.3.4.post1}/setup.cfg +0 -0
  48. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/__init__.py +0 -0
  49. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/api.py +0 -0
  50. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/bench_server_latency.py +0 -0
  51. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/bench_serving.py +0 -0
  52. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/check_env.py +0 -0
  53. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/global_config.py +0 -0
  54. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/__init__.py +0 -0
  55. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/backend/__init__.py +0 -0
  56. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/backend/anthropic.py +0 -0
  57. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/backend/base_backend.py +0 -0
  58. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/backend/litellm.py +0 -0
  59. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/backend/openai.py +0 -0
  60. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  61. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/backend/vertexai.py +0 -0
  62. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/choices.py +0 -0
  63. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/compiler.py +0 -0
  64. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/interpreter.py +0 -0
  65. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/ir.py +0 -0
  66. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/tracer.py +0 -0
  67. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/launch_server.py +0 -0
  68. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/configs/exaone.py +0 -0
  69. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/constrained/__init__.py +0 -0
  70. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
  71. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/constrained/fsm_cache.py +0 -0
  72. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/constrained/jump_forward.py +0 -0
  73. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/activation.py +0 -0
  74. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  75. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  76. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  77. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  78. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  79. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
  80. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
  81. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/layernorm.py +0 -0
  82. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/logits_processor.py +0 -0
  83. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/pooler.py +0 -0
  84. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
  85. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  86. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/radix_attention.py +0 -0
  87. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/sampler.py +0 -0
  88. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/torchao_utils.py +0 -0
  89. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/lora/lora_config.py +0 -0
  90. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/lora/lora_manager.py +0 -0
  91. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  92. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  93. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/io_struct.py +0 -0
  94. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  95. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  96. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  97. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  98. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/mm_utils.py +0 -0
  99. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/baichuan.py +0 -0
  100. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/commandr.py +0 -0
  101. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/dbrx.py +0 -0
  102. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/deepseek.py +0 -0
  103. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/exaone.py +0 -0
  104. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/gemma.py +0 -0
  105. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/gemma2.py +0 -0
  106. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  107. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/grok.py +0 -0
  108. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/internlm2.py +0 -0
  109. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/llama.py +0 -0
  110. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/llama_classification.py +0 -0
  111. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/llama_embedding.py +0 -0
  112. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/llama_reward.py +0 -0
  113. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/llava.py +0 -0
  114. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/llavavid.py +0 -0
  115. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/minicpm.py +0 -0
  116. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/minicpm3.py +0 -0
  117. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/mistral.py +0 -0
  118. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/mixtral.py +0 -0
  119. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  120. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/olmo.py +0 -0
  121. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/olmoe.py +0 -0
  122. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/qwen.py +0 -0
  123. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/qwen2.py +0 -0
  124. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  125. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/stablelm.py +0 -0
  126. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/torch_native_llama.py +0 -0
  127. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/xverse.py +0 -0
  128. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/xverse_moe.py +0 -0
  129. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/models/yivl.py +0 -0
  130. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/openai_api/adapter.py +0 -0
  131. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/openai_api/protocol.py +0 -0
  132. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  133. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  134. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  135. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  136. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  137. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  138. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  139. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/server.py +0 -0
  140. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  141. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  142. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/run_eval.py +0 -0
  143. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/simple_eval_common.py +0 -0
  144. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  145. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  146. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/simple_eval_math.py +0 -0
  147. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  148. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  149. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  150. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/test_activation.py +0 -0
  151. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/test_layernorm.py +0 -0
  152. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/test_programs.py +0 -0
  153. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/test/test_utils.py +0 -0
  154. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang/utils.py +0 -0
  155. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang.egg-info/dependency_links.txt +0 -0
  156. {sglang-0.3.4 → sglang-0.3.4.post1}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.3.4/sglang.egg-info → sglang-0.3.4.post1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.4
+ Version: 0.3.4.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                           Version 2.0, January 2004
@@ -236,7 +236,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: torch; extra == "srt"
- Requires-Dist: vllm==0.5.5; extra == "srt"
+ Requires-Dist: vllm==0.6.3.post1; extra == "srt"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
  Provides-Extra: openai
@@ -284,17 +284,17 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
  --------------------------------------------------------------------------------

  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
- [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+ [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

  ## News
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
  - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
  - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -335,7 +335,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -575,6 +575,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
  - MiniCPM / MiniCPM 3
  - XVERSE / XVERSE MoE
  - SmolLM
+ - GLM-4

  **Embedding Models**

@@ -892,7 +893,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
  <p align="center">
  <a href="#sglangtop" target="_blank">
  <bold>Back To Top </bold>

{sglang-0.3.4 → sglang-0.3.4.post1}/README.md
@@ -12,17 +12,17 @@
  --------------------------------------------------------------------------------

  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
- [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+ [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

  ## News
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
  - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
  - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -63,7 +63,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -303,6 +303,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
  - MiniCPM / MiniCPM 3
  - XVERSE / XVERSE MoE
  - SmolLM
+ - GLM-4

  **Embedding Models**

@@ -620,7 +621,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
  <p align="center">
  <a href="#sglangtop" target="_blank">
  <bold>Back To Top </bold>

{sglang-0.3.4 → sglang-0.3.4.post1}/pyproject.toml
@@ -4,20 +4,16 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.3.4"
+ version = "0.3.4.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
- license = {file = "LICENSE"}
+ license = { file = "LICENSE" }
  classifiers = [
      "Programming Language :: Python :: 3",
      "License :: OSI Approved :: Apache Software License",
  ]
- dependencies = [
-     "requests",
-     "tqdm",
-     "numpy",
- ]
+ dependencies = ["requests", "tqdm", "numpy"]

  [project.optional-dependencies]
  runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
@@ -26,13 +22,20 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
      "outlines>=0.0.44", "modelscope"]
  # xpu is not enabled in public vllm and torch whl,
  # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
- srt = ["sglang[runtime_common]", "torch", "vllm==0.5.5"]
+ srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
  srt_xpu = ["sglang[runtime_common]"]

  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
- test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate", "peft"]
+ test = [
+     "jsonlines",
+     "matplotlib",
+     "pandas",
+     "sentence_transformers",
+     "accelerate",
+     "peft",
+ ]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  dev = ["sglang[all]", "sglang[test]"]
@@ -43,7 +46,23 @@ dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
  "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"

  [tool.setuptools.packages.find]
- exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]
+ exclude = [
+     "assets*",
+     "benchmark*",
+     "docs*",
+     "dist*",
+     "playground*",
+     "scripts*",
+     "tests*",
+ ]

  [tool.wheel]
- exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]
+ exclude = [
+     "assets*",
+     "benchmark*",
+     "docs*",
+     "dist*",
+     "playground*",
+     "scripts*",
+     "tests*",
+ ]
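
The functional change in the build metadata is the vllm pin, raised from 0.5.5 to 0.6.3.post1; the remaining hunks only reflow lists. A minimal sanity check that an environment picked up the new pins (stdlib only, valid on the declared Python >= 3.8 floor):

```python
# Hypothetical post-upgrade check; package names as pinned above.
from importlib.metadata import version

print(version("sglang"))  # expected: 0.3.4.post1
print(version("vllm"))    # expected: 0.6.3.post1
```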

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/bench_latency.py
@@ -227,8 +227,9 @@ def extend(reqs, model_runner):
      req_to_token_pool=model_runner.req_to_token_pool,
      token_to_kv_pool=model_runner.token_to_kv_pool,
      tree_cache=None,
+     model_config=model_runner.model_config,
  )
- batch.prepare_for_extend(model_runner.model_config.vocab_size)
+ batch.prepare_for_extend()
  model_worker_batch = batch.get_model_worker_batch()
  forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
  logits_output = model_runner.forward(forward_batch)

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/lang/chat_template.py
@@ -133,6 +133,22 @@ register_chat_template(
      )
  )

+ # Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+ register_chat_template(
+     ChatTemplate(
+         name="qwen2-vl",
+         default_system_prompt="You are a helpful assistant.",
+         role_prefix_and_suffix={
+             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+             "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+             "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+         },
+         style=ChatTemplateStyle.PLAIN,
+         stop_str=("<|im_end|>"),
+         image_token="<|vision_start|><|image_pad|><|vision_end|>",
+     )
+ )
+

  register_chat_template(
      ChatTemplate(
@@ -213,6 +229,7 @@ register_chat_template(
              ),
          },
          stop_str=("<|eot_id|>",),
+         image_token="<|image|>",
      )
  )

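
A hedged usage sketch for the template registered above: `get_chat_template` is the lookup that pairs with `register_chat_template` in this module, and the rendering helper is illustrative, built only from the `role_prefix_and_suffix`, `default_system_prompt`, and `image_token` fields shown in the diff.

```python
from sglang.lang.chat_template import get_chat_template

tmpl = get_chat_template("qwen2-vl")

def render(turns):
    # Wrap each message in its role's prefix/suffix, then leave an
    # assistant turn open for generation.
    text = ""
    for role, msg in turns:
        prefix, suffix = tmpl.role_prefix_and_suffix[role]
        text += prefix + msg + suffix
    return text + tmpl.role_prefix_and_suffix["assistant"][0]

prompt = render([
    ("system", tmpl.default_system_prompt),
    ("user", tmpl.image_token + "Describe this image."),
])
print(prompt)
```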

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/launch_server_llavavid.py
@@ -14,7 +14,7 @@ if __name__ == "__main__":
      model_override_args["num_frames"] = 16
      model_override_args["model_type"] = "llavavid"
      if model_override_args["num_frames"] == 32:
-         model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+         model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
      model_override_args["max_sequence_length"] = 4096 * 2
      model_override_args["tokenizer_model_max_length"] = 4096 * 2
      model_override_args["model_max_length"] = 4096 * 2
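
The only change here is the rope-scaling key, renamed from `type` to `rope_type` to match the naming newer transformers/vllm rope-config handling expects (the same rename the commented-out block in `qwen2vl.py` below works around). A sketch of the override payload this script would build if the `num_frames == 32` branch were taken; values are copied from the diff:

```python
import json

model_override_args = {
    "num_frames": 32,
    "model_type": "llavavid",
    "rope_scaling": {"factor": 2.0, "rope_type": "linear"},  # key renamed from "type"
    "max_sequence_length": 4096 * 2,
    "tokenizer_model_max_length": 4096 * 2,
    "model_max_length": 4096 * 2,
}
print(json.dumps(model_override_args))
```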

sglang-0.3.4.post1/sglang/srt/configs/__init__.py (new file)
@@ -0,0 +1,8 @@
+ from sglang.srt.configs.exaone import ExaoneConfig
+ from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
+
+ __all__ = [
+     "ExaoneConfig",
+     "Qwen2VLConfig",
+     "Qwen2VLVisionConfig",
+ ]

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/configs/model_config.py
@@ -89,6 +89,8 @@ class ModelConfig:
      self.num_hidden_layers = self.hf_text_config.num_hidden_layers
      self.vocab_size = self.hf_text_config.vocab_size

+     self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
+
  # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
  def get_total_num_kv_heads(self) -> int:
      """Returns the total number of KV heads."""

sglang-0.3.4.post1/sglang/srt/configs/qwen2vl.py (new file)
@@ -0,0 +1,133 @@
+ # coding=utf-8
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+ # All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Qwen2VL model configuration"""
+
+ import os
+ from typing import Union
+
+ from transformers import PretrainedConfig
+
+
+ class Qwen2VLVisionConfig(PretrainedConfig):
+     model_type = "qwen2_vl"
+
+     def __init__(
+         self,
+         depth=32,
+         embed_dim=1280,
+         hidden_size=3584,
+         hidden_act="quick_gelu",
+         mlp_ratio=4,
+         num_heads=16,
+         in_channels=3,
+         patch_size=14,
+         spatial_merge_size=2,
+         temporal_patch_size=2,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.depth = depth
+         self.embed_dim = embed_dim
+         self.hidden_size = hidden_size
+         self.hidden_act = hidden_act
+         self.mlp_ratio = mlp_ratio
+         self.num_heads = num_heads
+         self.in_channels = in_channels
+         self.patch_size = patch_size
+         self.spatial_merge_size = spatial_merge_size
+         self.temporal_patch_size = temporal_patch_size
+
+     @classmethod
+     def from_pretrained(
+         cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+     ) -> "PretrainedConfig":
+         cls._set_token_in_kwargs(kwargs)
+
+         config_dict, kwargs = cls.get_config_dict(
+             pretrained_model_name_or_path, **kwargs
+         )
+
+         if config_dict.get("model_type") == "qwen2_vl":
+             config_dict = config_dict["vision_config"]
+
+         return cls.from_dict(config_dict, **kwargs)
+
+
+ class Qwen2VLConfig(PretrainedConfig):
+     model_type = "qwen2_vl"
+
+     def __init__(
+         self,
+         vocab_size=152064,
+         hidden_size=8192,
+         intermediate_size=29568,
+         num_hidden_layers=80,
+         num_attention_heads=64,
+         num_key_value_heads=8,
+         hidden_act="silu",
+         max_position_embeddings=32768,
+         initializer_range=0.02,
+         rms_norm_eps=1e-05,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=1000000.0,
+         use_sliding_window=False,
+         sliding_window=4096,
+         max_window_layers=80,
+         attention_dropout=0.0,
+         vision_config=None,
+         rope_scaling=None,
+         **kwargs,
+     ):
+         if isinstance(vision_config, dict):
+             self.vision_config = Qwen2VLVisionConfig(**vision_config)
+         elif vision_config is None:
+             self.vision_config = Qwen2VLVisionConfig()
+
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.use_sliding_window = use_sliding_window
+         self.sliding_window = sliding_window
+         self.max_window_layers = max_window_layers
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.attention_dropout = attention_dropout
+         self.rope_scaling = rope_scaling
+
+         # NOTE: the following section from original transformers config
+         # for Qwen2-VL is commented out to address rope config loading issue
+         #
+         # if self.rope_scaling is not None and "type" in self.rope_scaling:
+         #     if self.rope_scaling["type"] == "mrope":
+         #         self.rope_scaling["type"] = "default"
+         #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         # rope_config_validation(self)
+
+         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
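
A small sketch of the class defined above; the defaults correspond to the largest (72B-scale) variant, and the `rope_scaling` value is illustrative. Because the upstream normalization is commented out, the dict is stored verbatim:

```python
from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig

cfg = Qwen2VLConfig(rope_scaling={"type": "mrope", "mrope_section": [16, 24, 24]})

assert isinstance(cfg.vision_config, Qwen2VLVisionConfig)  # defaults filled in
assert cfg.vision_config.patch_size == 14
# No rewrite of "mrope" to "default" and no rope_config_validation call:
assert cfg.rope_scaling["type"] == "mrope"
```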

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/conversation.py
@@ -509,6 +509,19 @@ register_conv_template(
      )
  )

+ register_conv_template(
+     Conversation(
+         name="llama_3_vision",
+         system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+         system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+         roles=("user", "assistant"),
+         sep_style=SeparatorStyle.LLAMA3,
+         sep="",
+         stop_str=["<|end_of_text|>", "<|eot_id|>"],
+         image_token="<|image|>",
+     )
+ )
+
  register_conv_template(
      Conversation(
          name="llava_llama_3",
@@ -530,3 +543,17 @@ register_conv_template(
          stop_str=["<|im_end|>", "<|action_end|>"],
      )
  )
+
+ # Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+ register_conv_template(
+     Conversation(
+         name="qwen2-vl",
+         system_message="You are a helpful assistant.",
+         system_template="<|im_start|>system\n{system_message}",
+         roles=("<|im_start|>user", "<|im_start|>assistant"),
+         sep="<|im_end|>\n",
+         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+         stop_str=["<|im_end|>"],
+         image_token="<|vision_start|><|image_pad|><|vision_end|>",
+     )
+ )
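
A hedged usage sketch of the new qwen2-vl template, assuming the FastChat-style API this module retains (a module-level `conv_templates` registry filled by `register_conv_template`, and `copy`/`append_message`/`get_prompt` on `Conversation`):

```python
from sglang.srt.conversation import conv_templates

conv = conv_templates["qwen2-vl"].copy()
conv.append_message(conv.roles[0], conv.image_token + "What is in this image?")
conv.append_message(conv.roles[1], None)  # leave the reply slot open
print(conv.get_prompt())
```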

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/hf_transformers_utils.py
@@ -33,12 +33,13 @@ from transformers import (
  try:
      from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

-     from sglang.srt.configs import ExaoneConfig
+     from sglang.srt.configs import ExaoneConfig, Qwen2VLConfig

      _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
          ChatGLMConfig.model_type: ChatGLMConfig,
          DbrxConfig.model_type: DbrxConfig,
          ExaoneConfig.model_type: ExaoneConfig,
+         Qwen2VLConfig.model_type: Qwen2VLConfig,
      }
  except ImportError:
      # We want this file to run without vllm dependency

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/__init__.py
@@ -1,8 +1,10 @@
  from abc import ABC, abstractmethod
+ from typing import Optional

  import torch
  from torch import nn

+ from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch


@@ -19,13 +21,22 @@ class AttentionBackend(ABC):
          raise NotImplementedError()

      def init_forward_metadata_capture_cuda_graph(
-         self, bs: int, req_pool_indices: torch.Tensor, seq_lens: torch.Tensor
+         self,
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         encoder_lens: Optional[torch.Tensor] = None,
      ):
          """Init the metadata for a forward pass for capturing a cuda graph."""
          raise NotImplementedError()

      def init_forward_metadata_replay_cuda_graph(
-         self, bs: int, req_pool_indices: torch.Tensor, seq_lens: torch.Tensor
+         self,
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         seq_lens_sum: int,
+         encoder_lens: Optional[torch.Tensor] = None,
      ):
          """Init the metadata for a forward pass for replying a cuda graph."""
          raise NotImplementedError()
@@ -39,7 +50,7 @@ class AttentionBackend(ABC):
          q: torch.Tensor,
          k: torch.Tensor,
          v: torch.Tensor,
-         layer: nn.Module,
+         layer: RadixAttention,
          forward_batch: ForwardBatch,
      ):
          """Run forward on an attention layer."""
@@ -53,7 +64,7 @@ class AttentionBackend(ABC):
          q: torch.Tensor,
          k: torch.Tensor,
          v: torch.Tensor,
-         layer: nn.Module,
+         layer: RadixAttention,
          forward_batch: ForwardBatch,
      ):
          """Run a forward for decode."""
@@ -64,7 +75,7 @@ class AttentionBackend(ABC):
          q: torch.Tensor,
          k: torch.Tensor,
          v: torch.Tensor,
-         layer: nn.Module,
+         layer: RadixAttention,
          forward_batch: ForwardBatch,
      ):
          """Run a forward for extend."""

{sglang-0.3.4 → sglang-0.3.4.post1}/sglang/srt/layers/attention/double_sparsity_backend.py
@@ -10,6 +10,7 @@ from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch

  if TYPE_CHECKING:
+     from sglang.srt.layers.radix_attention import RadixAttention
      from sglang.srt.model_executor.model_runner import ModelRunner


@@ -134,8 +135,13 @@ class DoubleSparseAttnBackend(AttentionBackend):
          )

      def init_forward_metadata_capture_cuda_graph(
-         self, bs: int, req_pool_indices: torch.Tensor, seq_lens: torch.Tensor
+         self,
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         encoder_lens=None,
      ):
+         # NOTE: encoder_lens expected to be zeros or None
          self.forward_metadata = (
              self.cuda_graph_start_loc,
              self.cuda_graph_attn_logits,
@@ -144,15 +150,23 @@ class DoubleSparseAttnBackend(AttentionBackend):
          )

      def init_forward_metadata_replay_cuda_graph(
-         self, bs: int, req_pool_indices: torch.Tensor, seq_lens: torch.Tensor
+         self,
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         seq_lens_sum: int,
+         encoder_lens=None,
      ):
+         # NOTE: encoder_lens expected to be zeros or None
          self.cuda_graph_start_loc.zero_()
          self.cuda_graph_start_loc[1:bs] = torch.cumsum(seq_lens[: bs - 1], dim=0)

      def get_cuda_graph_seq_len_fill_value(self):
          return 1

-     def forward_extend(self, q, k, v, layer: nn.Module, forward_batch: ForwardBatch):
+     def forward_extend(
+         self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+     ):
          # TODO: reuse the buffer across layers
          if layer.qk_head_dim != layer.v_head_dim:
              o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
@@ -168,7 +182,7 @@ class DoubleSparseAttnBackend(AttentionBackend):
          )

          forward_batch.token_to_kv_pool.set_kv_buffer(
-             layer.layer_id, forward_batch.out_cache_loc, k, v, k_label
+             layer, forward_batch.out_cache_loc, k, v, k_label
          )

          (
@@ -197,7 +211,9 @@ class DoubleSparseAttnBackend(AttentionBackend):
          )
          return o

-     def forward_decode(self, q, k, v, layer: nn.Module, forward_batch: ForwardBatch):
+     def forward_decode(
+         self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+     ):
          # During torch.compile, there is a bug in rotary_emb that causes the
          # output value to have a 3D tensor shape. This reshapes the output correctly.
          q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
@@ -227,7 +243,7 @@ class DoubleSparseAttnBackend(AttentionBackend):
          )

          forward_batch.token_to_kv_pool.set_kv_buffer(
-             layer.layer_id, forward_batch.out_cache_loc, k, v, k_label
+             layer, forward_batch.out_cache_loc, k, v, k_label
          )

          # NOTE(Andy) shouldn't be used when max_len_in_batch < heavy_token_num
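
The other recurring change in these backends is the first argument of `set_kv_buffer`: callers now hand over the `RadixAttention` layer itself rather than the bare `layer.layer_id`, so the pool can resolve any per-layer state internally (the matching pool-side change lands in `memory_pool.py`, +47/−8 in the file list). A toy illustration of the new calling convention, not the real pool implementation:

```python
import torch


class ToyKVPool:
    """Sketch of the post-change set_kv_buffer signature only."""

    def __init__(self, num_layers: int, size: int, head_num: int, head_dim: int):
        self.k_buffer = [torch.zeros(size, head_num, head_dim) for _ in range(num_layers)]
        self.v_buffer = [torch.zeros(size, head_num, head_dim) for _ in range(num_layers)]

    def set_kv_buffer(self, layer, loc, cache_k, cache_v, *extra):
        # layer_id (and extras such as the double-sparsity k_label) are
        # resolved here instead of by every caller.
        i = layer.layer_id
        self.k_buffer[i][loc] = cache_k
        self.v_buffer[i][loc] = cache_v
```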