sglang-0.4.0.tar.gz → sglang-0.4.0.post1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. {sglang-0.4.0 → sglang-0.4.0.post1}/PKG-INFO +5 -4
  2. {sglang-0.4.0 → sglang-0.4.0.post1}/README.md +3 -2
  3. {sglang-0.4.0 → sglang-0.4.0.post1}/pyproject.toml +2 -2
  4. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/__init__.py +1 -1
  5. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/outlines_backend.py +5 -0
  6. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/xgrammar_backend.py +5 -5
  7. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/__init__.py +5 -2
  8. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
  9. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/flashinfer_backend.py +20 -5
  10. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/torch_native_backend.py +22 -8
  11. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_backend.py +22 -8
  12. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
  13. sglang-0.4.0.post1/sglang/srt/layers/ep_moe/__init__.py +0 -0
  14. sglang-0.4.0.post1/sglang/srt/layers/ep_moe/kernels.py +349 -0
  15. sglang-0.4.0.post1/sglang/srt/layers/ep_moe/layer.py +661 -0
  16. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/quantization/__init__.py +2 -2
  17. sglang-0.4.0.post1/sglang/srt/layers/quantization/fp8.py +559 -0
  18. sglang-0.4.0.post1/sglang/srt/layers/quantization/fp8_utils.py +27 -0
  19. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/radix_attention.py +4 -2
  20. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/sampler.py +2 -0
  21. sglang-0.4.0.post1/sglang/srt/layers/torchao_utils.py +73 -0
  22. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/schedule_batch.py +1 -0
  23. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/scheduler.py +69 -65
  24. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
  25. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/memory_pool.py +5 -1
  26. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_executor/cuda_graph_runner.py +15 -1
  27. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_executor/model_runner.py +11 -4
  28. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_parallel.py +1 -5
  29. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/commandr.py +2 -2
  30. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/deepseek_v2.py +87 -7
  31. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/grok.py +0 -5
  32. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama.py +0 -5
  33. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mixtral.py +12 -9
  34. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/phi3_small.py +0 -5
  35. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen2_moe.py +0 -5
  36. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/torch_native_llama.py +0 -5
  37. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/sampling_batch_info.py +9 -8
  38. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/server.py +3 -3
  39. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/server_args.py +43 -4
  40. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/utils.py +50 -0
  41. sglang-0.4.0.post1/sglang/version.py +1 -0
  42. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/PKG-INFO +5 -4
  43. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/SOURCES.txt +5 -0
  44. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/requires.txt +1 -1
  45. sglang-0.4.0/sglang/srt/layers/torchao_utils.py +0 -95
  46. sglang-0.4.0/sglang/version.py +0 -1
  47. {sglang-0.4.0 → sglang-0.4.0.post1}/LICENSE +0 -0
  48. {sglang-0.4.0 → sglang-0.4.0.post1}/setup.cfg +0 -0
  49. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/api.py +0 -0
  50. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_latency.py +0 -0
  51. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_offline_throughput.py +0 -0
  52. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_one_batch.py +0 -0
  53. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_one_batch_server.py +0 -0
  54. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/bench_serving.py +0 -0
  55. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/check_env.py +0 -0
  56. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/global_config.py +0 -0
  57. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/__init__.py +0 -0
  58. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/__init__.py +0 -0
  59. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/anthropic.py +0 -0
  60. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/base_backend.py +0 -0
  61. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/litellm.py +0 -0
  62. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/openai.py +0 -0
  63. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  64. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/backend/vertexai.py +0 -0
  65. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/chat_template.py +0 -0
  66. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/choices.py +0 -0
  67. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/compiler.py +0 -0
  68. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/interpreter.py +0 -0
  69. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/ir.py +0 -0
  70. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/lang/tracer.py +0 -0
  71. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/launch_server.py +0 -0
  72. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/launch_server_llavavid.py +0 -0
  73. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/_custom_ops.py +0 -0
  74. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/__init__.py +0 -0
  75. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/device_config.py +0 -0
  76. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/exaone.py +0 -0
  77. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/load_config.py +0 -0
  78. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/model_config.py +0 -0
  79. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/configs/qwen2vl.py +0 -0
  80. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/__init__.py +0 -0
  81. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  82. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  83. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/conversation.py +0 -0
  84. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/__init__.py +0 -0
  85. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/communication_op.py +0 -0
  86. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  87. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  88. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  89. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  90. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  91. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  92. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  93. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  94. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  95. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/parallel_state.py +0 -0
  96. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/distributed/utils.py +0 -0
  97. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/hf_transformers_utils.py +0 -0
  98. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/activation.py +0 -0
  99. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  100. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  101. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  102. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/custom_op_util.py +0 -0
  103. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_patch.py +0 -0
  104. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_triton/__init__.py +0 -0
  105. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_triton/fused_moe.py +0 -0
  106. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/fused_moe_triton/layer.py +0 -0
  107. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/layernorm.py +0 -0
  108. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/linear.py +0 -0
  109. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/logits_processor.py +0 -0
  110. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/pooler.py +0 -0
  111. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
  112. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
  113. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  114. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/lora/lora.py +0 -0
  115. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/lora/lora_config.py +0 -0
  116. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/lora/lora_manager.py +0 -0
  117. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  118. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  119. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/image_processor.py +0 -0
  120. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/io_struct.py +0 -0
  121. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/schedule_policy.py +0 -0
  122. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/session_controller.py +0 -0
  123. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/tokenizer_manager.py +0 -0
  124. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/managers/tp_worker.py +0 -0
  125. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  126. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  127. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  128. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  129. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/metrics/collector.py +0 -0
  130. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/metrics/func_timer.py +0 -0
  131. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/mm_utils.py +0 -0
  132. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  133. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/__init__.py +0 -0
  134. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/loader.py +0 -0
  135. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/utils.py +0 -0
  136. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
  137. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/baichuan.py +0 -0
  138. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/chatglm.py +0 -0
  139. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/dbrx.py +0 -0
  140. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/deepseek.py +0 -0
  141. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/exaone.py +0 -0
  142. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gemma.py +0 -0
  143. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gemma2.py +0 -0
  144. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gemma2_reward.py +0 -0
  145. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gpt2.py +0 -0
  146. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
  147. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/internlm2.py +0 -0
  148. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/internlm2_reward.py +0 -0
  149. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama_classification.py +0 -0
  150. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama_embedding.py +0 -0
  151. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llama_reward.py +0 -0
  152. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llava.py +0 -0
  153. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/llavavid.py +0 -0
  154. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/minicpm.py +0 -0
  155. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/minicpm3.py +0 -0
  156. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mistral.py +0 -0
  157. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  158. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/mllama.py +0 -0
  159. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/olmo.py +0 -0
  160. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/olmo2.py +0 -0
  161. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/olmoe.py +0 -0
  162. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen.py +0 -0
  163. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen2.py +0 -0
  164. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/qwen2_vl.py +0 -0
  165. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/registry.py +0 -0
  166. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/stablelm.py +0 -0
  167. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/xverse.py +0 -0
  168. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/xverse_moe.py +0 -0
  169. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/models/yivl.py +0 -0
  170. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/openai_api/adapter.py +0 -0
  171. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/openai_api/protocol.py +0 -0
  172. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  173. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  174. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  175. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  176. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  177. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  178. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/sampling/sampling_params.py +0 -0
  179. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/few_shot_gsm8k.py +0 -0
  180. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  181. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/run_eval.py +0 -0
  182. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/runners.py +0 -0
  183. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_common.py +0 -0
  184. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_gpqa.py +0 -0
  185. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_humaneval.py +0 -0
  186. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_math.py +0 -0
  187. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  188. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/simple_eval_mmlu.py +0 -0
  189. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  190. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_activation.py +0 -0
  191. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_layernorm.py +0 -0
  192. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_programs.py +0 -0
  193. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/test/test_utils.py +0 -0
  194. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang/utils.py +0 -0
  195. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/dependency_links.txt +0 -0
  196. {sglang-0.4.0 → sglang-0.4.0.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.0 → sglang-0.4.0.post1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.0
+Version: 0.4.0.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -239,7 +239,7 @@ Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
+Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer>=0.1.6; extra == "srt"
 Provides-Extra: srt-hip
@@ -315,6 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -346,13 +347,13 @@ The core features include:
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0 → sglang-0.4.0.post1}/README.md
@@ -16,6 +16,7 @@
 [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -47,13 +48,13 @@ The core features include:
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
+Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0 → sglang-0.4.0.post1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.0"
+version = "0.4.0.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
     "xgrammar>=0.1.4"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer>=0.1.6"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer>=0.1.6"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
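The practical effect of this change is a closed version window for vLLM rather than an open-ended lower bound. A minimal sketch of how the new specifier behaves under PEP 440, using the third-party `packaging` library; the candidate version strings below are illustrative:

```python
# Sketch of what the new pin accepts; `packaging` implements PEP 440.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=0.6.3.post1,<=0.6.4.post1")
for candidate in ["0.6.3", "0.6.3.post1", "0.6.4", "0.6.4.post1", "0.6.5"]:
    print(candidate, spec.contains(Version(candidate)))
# -> False, True, True, True, False: 0.6.3 predates the lower bound, and
#    anything newer than 0.6.4.post1 is now excluded by the new upper bound.
```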
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/__init__.py
@@ -66,7 +66,7 @@ from sglang.version import __version__
 
 __all__ += ["__version__"]
 
-# SGL Backends
+# SGLang Backends
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import LazyImport
 
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/outlines_backend.py
@@ -42,6 +42,7 @@ class OutlinesGrammar(BaseGrammarObject):
         self.guide = guide
         self.jump_forward_map = jump_forward_map
         self.state = 0
+        self.finished = False
 
     def accept_token(self, token: int):
         self.state = self.guide.get_next_state(self.state, token)
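Both grammar backends gain a `finished` attribute in this release (the same line is added to `XGrammarGrammar` further down). A hedged sketch of the kind of guard this enables; the driver function and names here are illustrative, not sglang's actual scheduler code:

```python
def advance_grammar(grammar, sampled_token_ids):
    # Illustrative only: `finished` (initialized to False in both backends)
    # lets the code driving a grammar object stop feeding tokens into a
    # matcher once its request has completed, e.g. when sampling runs
    # ahead of grammar updates under an overlapped scheduler.
    for token_id in sampled_token_ids:
        if grammar.finished:
            break  # do not advance an already-completed grammar
        grammar.accept_token(token_id)
```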
@@ -84,6 +85,10 @@ class OutlinesGrammar(BaseGrammarObject):
     ) -> torch.Tensor:
         return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)
 
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask
+
     def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
         tokens = torch.tensor(
             self.guide.get_next_instruction(self.state).tokens, dtype=torch.int64
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/constrained/xgrammar_backend.py
@@ -45,6 +45,7 @@ class XGrammarGrammar(BaseGrammarObject):
         self.matcher = matcher
         self.vocab_size = vocab_size
         self.ctx = ctx
+        self.finished = False
 
     def accept_token(self, token: int):
         assert self.matcher.accept_token(token)
@@ -85,12 +86,11 @@ class XGrammarGrammar(BaseGrammarObject):
         self.matcher.fill_next_token_bitmask(vocab_mask, idx)
 
     @staticmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        if vocab_mask.device.type != logits.device.type:
-            # vocab_mask must then be on the same device as logits
-            # when applying the token bitmask, so we check and move if needed
-            vocab_mask = vocab_mask.to(logits.device)
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask.to(device, non_blocking=True)
 
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
         apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
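Taken together with the Outlines hunk above, this refactor splits the old on-the-fly device check out of `apply_vocab_mask` into an explicit `move_vocab_mask` step: a no-op for Outlines, and an asynchronous `.to(device, non_blocking=True)` copy for XGrammar that can be issued early. A minimal sketch of the resulting call order, assuming a single request (`idx=0`) and illustrative names:

```python
import torch

def constrained_logits(grammar, logits: torch.Tensor) -> torch.Tensor:
    # Illustrative sketch of the split move/apply API; the real call sites
    # live in sglang's sampling path. The device argument below is largely
    # advisory: Outlines builds the mask where it is used (move_vocab_mask
    # returns it unchanged), while XGrammar fills a host-side bitmask and
    # moves it here with a non-blocking copy that can overlap other work.
    vocab_mask = grammar.allocate_vocab_mask(
        vocab_size=logits.shape[-1], batch_size=1, device=logits.device
    )
    grammar.fill_vocab_mask(vocab_mask, idx=0)        # fill the token bitmask
    vocab_mask = grammar.move_vocab_mask(vocab_mask, logits.device)  # H2D copy
    grammar.apply_vocab_mask(logits, vocab_mask)      # mask logits in place
    return logits
```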
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/__init__.py
@@ -52,12 +52,13 @@ class AttentionBackend(ABC):
         v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
     ):
         """Run forward on an attention layer."""
         if forward_batch.forward_mode.is_decode():
-            return self.forward_decode(q, k, v, layer, forward_batch)
+            return self.forward_decode(q, k, v, layer, forward_batch, save_kv_cache)
         else:
-            return self.forward_extend(q, k, v, layer, forward_batch)
+            return self.forward_extend(q, k, v, layer, forward_batch, save_kv_cache)
 
     def forward_decode(
         self,
@@ -66,6 +67,7 @@ class AttentionBackend(ABC):
         v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
     ):
         """Run a forward for decode."""
         raise NotImplementedError()
@@ -77,6 +79,7 @@ class AttentionBackend(ABC):
         v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
     ):
         """Run a forward for extend."""
         raise NotImplementedError()
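All three entry points of the abstract `AttentionBackend` now thread a `save_kv_cache` flag through to the concrete backends, which, as the per-backend hunks below show, skip their `set_kv_buffer` write when it is False. A hedged sketch of what this changes at a call site; the wrapper and variable names are illustrative, not sglang's actual code:

```python
def run_attention(attn_backend, q, k, v, layer, forward_batch, reuse_cache: bool):
    # Illustrative wrapper: with save_kv_cache=True (the default, matching
    # the old behavior) the backend writes this step's k/v into the paged
    # KV pool before attending. With save_kv_cache=False the write is
    # skipped, so a layer whose cache slots are populated or managed
    # elsewhere only reads existing entries from the pool.
    return attn_backend.forward(
        q, k, v, layer, forward_batch, save_kv_cache=not reuse_cache
    )
```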
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/double_sparsity_backend.py
@@ -165,7 +165,13 @@ class DoubleSparseAttnBackend(AttentionBackend):
         return 1
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # TODO: reuse the buffer across layers
         if layer.qk_head_dim != layer.v_head_dim:
@@ -181,9 +187,10 @@ class DoubleSparseAttnBackend(AttentionBackend):
             .expand(k.shape[0], -1, -1),
         )
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v, k_label
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
 
         (
             start_loc,
@@ -212,7 +219,13 @@ class DoubleSparseAttnBackend(AttentionBackend):
         return o
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # During torch.compile, there is a bug in rotary_emb that causes the
         # output value to have a 3D tensor shape. This reshapes the output correctly.
@@ -242,9 +255,10 @@ class DoubleSparseAttnBackend(AttentionBackend):
             .expand(k.shape[0], -1, -1),
         )
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v, k_label
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
 
         # NOTE(Andy) shouldn't be used when max_len_in_batch < heavy_token_num
         # and set a minimum value for sparse_decode
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/flashinfer_backend.py
@@ -221,7 +221,13 @@ class FlashInferAttnBackend(AttentionBackend):
         return 0
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         prefill_wrapper_paged = self.prefill_wrappers_paged[
             self._get_wrapper_idx(layer)
@@ -237,7 +243,8 @@ class FlashInferAttnBackend(AttentionBackend):
         if not use_ragged:
             if k is not None:
                 assert v is not None
-                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+                if save_kv_cache:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
 
             o = prefill_wrapper_paged.forward(
                 q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
@@ -270,12 +277,19 @@
 
             o, _ = merge_state(o1, s1, o2, s2)
 
-            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
 
         return o.view(-1, layer.tp_q_head_num * layer.head_dim)
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         decode_wrapper = self.forward_metadata[0][self._get_wrapper_idx(layer)]
         cache_loc = (
@@ -286,7 +300,8 @@
 
         if k is not None:
             assert v is not None
-            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
 
         o = decode_wrapper.forward(
             q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/torch_native_backend.py
@@ -216,16 +216,23 @@ class TorchNativeAttnBackend(AttentionBackend):
         return output
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         if layer.qk_head_dim != layer.v_head_dim:
             o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
         else:
             o = torch.empty_like(q)
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
 
@@ -249,7 +256,13 @@ class TorchNativeAttnBackend(AttentionBackend):
         return o
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # During torch.compile, there is a bug in rotary_emb that causes the
         # output value to have a 3D tensor shape. This reshapes the output correctly.
@@ -260,9 +273,10 @@ class TorchNativeAttnBackend(AttentionBackend):
         else:
             o = torch.empty_like(q)
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
 
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_backend.py
@@ -114,7 +114,13 @@ class TritonAttnBackend(AttentionBackend):
         return 1
 
     def forward_extend(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # TODO: reuse the buffer across layers
         if layer.qk_head_dim != layer.v_head_dim:
@@ -122,9 +128,10 @@ class TritonAttnBackend(AttentionBackend):
         else:
             o = torch.empty_like(q)
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
         self.extend_attention_fwd(
@@ -146,7 +153,13 @@ class TritonAttnBackend(AttentionBackend):
         return o
 
     def forward_decode(
-        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
     ):
         # During torch.compile, there is a bug in rotary_emb that causes the
         # output value to have a 3D tensor shape. This reshapes the output correctly.
@@ -160,9 +173,10 @@ class TritonAttnBackend(AttentionBackend):
 
         start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
 
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer, forward_batch.out_cache_loc, k, v
-        )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
 
         self.decode_attention_fwd(
             q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
{sglang-0.4.0 → sglang-0.4.0.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -284,6 +284,9 @@
     elif Lq == 288:
         BLOCK_DMODEL = 256
         BLOCK_DPE = 32
+    elif Lq == 192:
+        BLOCK_DMODEL = 128
+        BLOCK_DPE = 64
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lq)
         BLOCK_DPE = 0
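The new `Lq == 192` branch follows the kernel's existing pattern of splitting a non-power-of-two head dimension into a power-of-two main block plus a second block for the remaining dimensions (192 = 128 + 64, consistent with DeepSeek-style MLA heads that pair a 128-dim no-PE part with a 64-dim RoPE part; note the `deepseek_v2.py` changes elsewhere in this release). A small sketch of the dispatch logic, using only the cases visible in this hunk:

```python
# Illustrative mirror of the Triton kernel's block-size dispatch above.
# BLOCK_DPE covers the positional-encoding tail that the power-of-two
# BLOCK_DMODEL cannot, so both tile dimensions stay power-of-two sized.
def pick_blocks(Lq: int) -> tuple[int, int]:
    special = {288: (256, 32), 192: (128, 64)}  # 192 is new in 0.4.0.post1
    if Lq in special:
        return special[Lq]  # (BLOCK_DMODEL, BLOCK_DPE)
    return next_power_of_2(Lq), 0  # default: no separate PE block

def next_power_of_2(n: int) -> int:
    return 1 << (n - 1).bit_length()

assert pick_blocks(192) == (128, 64)
assert pick_blocks(128) == (128, 0)
```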